2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.IOException;
\r
10 import java.text.ParseException;
\r
11 import java.util.Hashtable;
\r
12 import java.util.Vector;
\r
13 import java.util.Arrays;
\r
14 import java.util.Enumeration;
\r
16 import com.ibm.icu.impl.TrieBuilder;
\r
17 import com.ibm.icu.impl.IntTrieBuilder;
\r
18 import com.ibm.icu.impl.TrieIterator;
\r
19 import com.ibm.icu.impl.Utility;
\r
20 import com.ibm.icu.impl.UCharacterProperty;
\r
21 import com.ibm.icu.lang.UCharacter;
\r
22 import com.ibm.icu.lang.UCharacterCategory;
\r
23 import com.ibm.icu.impl.NormalizerImpl;
\r
24 import com.ibm.icu.util.RangeValueIterator;
\r
25 import com.ibm.icu.util.VersionInfo;
\r
28 * Class for building a collator from a list of collation rules.
\r
29 * This class is uses CollationRuleParser
\r
30 * @author Syn Wee Quek
\r
31 * @since release 2.2, June 11 2002
\r
33 final class CollationParsedRuleBuilder
\r
35 // package private constructors ------------------------------------------
\r
39 * @param rules collation rules
\r
40 * @exception ParseException thrown when argument rules have an invalid
\r
43 CollationParsedRuleBuilder(String rules) throws ParseException
\r
45 m_parser_ = new CollationRuleParser(rules);
\r
46 m_parser_.assembleTokenList();
\r
47 m_utilColEIter_ = RuleBasedCollator.UCA_.getCollationElementIterator(
\r
51 // package private inner classes -----------------------------------------
\r
54 * Inverse UCA wrapper
\r
56 static class InverseUCA
\r
58 // package private constructor ---------------------------------------
\r
64 // package private data member ---------------------------------------
\r
67 * Array list of characters
\r
71 * Array list of continuation characters
\r
73 char m_continuations_[];
\r
76 * UCA version of inverse UCA table
\r
78 VersionInfo m_UCA_version_;
\r
80 // package private method --------------------------------------------
\r
83 * Returns the previous inverse ces of the argument ces
\r
84 * @param ce ce to test
\r
85 * @param contce continuation ce to test
\r
86 * @param strength collation strength
\r
87 * @param prevresult an array to store the return results previous
\r
88 * inverse ce and previous inverse continuation ce
\r
89 * @return result of the inverse ce
\r
91 final int getInversePrevCE(int ce, int contce, int strength,
\r
94 int result = findInverseCE(ce, contce);
\r
97 prevresult[0] = CollationElementIterator.NULLORDER;
\r
101 ce &= STRENGTH_MASK_[strength];
\r
102 contce &= STRENGTH_MASK_[strength];
\r
104 prevresult[0] = ce;
\r
105 prevresult[1] = contce;
\r
107 while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
\r
108 && (prevresult[1] & STRENGTH_MASK_[strength])== contce
\r
110 // this condition should prevent falling off the edge of the
\r
112 // here, we end up in a singularity - zero
\r
113 prevresult[0] = m_table_[3 * (-- result)];
\r
114 prevresult[1] = m_table_[3 * result + 1];
\r
119 final int getCEStrengthDifference(int CE, int contCE,
\r
120 int prevCE, int prevContCE) {
\r
121 int strength = Collator.TERTIARY;
\r
123 ((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength])
\r
124 || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
\r
125 && (strength != 0)) {
\r
131 private int compareCEs(int source0, int source1, int target0, int target1) {
\r
132 int s1 = source0, s2, t1 = target0, t2;
\r
133 if(RuleBasedCollator.isContinuation(source1)) {
\r
138 if(RuleBasedCollator.isContinuation(target1)) {
\r
145 if(s1 == t1 && s2 == t2) {
\r
148 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
\r
149 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
\r
151 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
\r
152 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
\r
154 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
\r
155 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
\r
156 return Utility.compareUnsigned(s, t);
\r
158 return Utility.compareUnsigned(s, t);
\r
161 return Utility.compareUnsigned(s, t);
\r
166 * Finding the inverse CE of the argument CEs
\r
167 * @param ce CE to be tested
\r
168 * @param contce continuation CE
\r
169 * @return inverse CE
\r
171 int findInverseCE(int ce, int contce)
\r
174 int top = m_table_.length / 3;
\r
177 while (bottom < top - 1) {
\r
178 result = (top + bottom) >> 1;
\r
179 int first = m_table_[3 * result];
\r
180 int second = m_table_[3 * result + 1];
\r
181 int comparison = compareCEs(first, second, ce, contce);
\r
182 if (comparison > 0) {
\r
185 else if (comparison < 0) {
\r
197 * Getting gap offsets in the inverse UCA
\r
198 * @param listheader parsed token lists
\r
199 * @exception Exception thrown when error occurs while finding the
\r
202 void getInverseGapPositions(CollationRuleParser.TokenListHeader
\r
206 // reset all the gaps
\r
207 CollationRuleParser.Token token = listheader.m_first_;
\r
208 int tokenstrength = token.m_strength_;
\r
210 for (int i = 0; i < 3; i ++) {
\r
211 listheader.m_gapsHi_[3 * i] = 0;
\r
212 listheader.m_gapsHi_[3 * i + 1] = 0;
\r
213 listheader.m_gapsHi_[3 * i + 2] = 0;
\r
214 listheader.m_gapsLo_[3 * i] = 0;
\r
215 listheader.m_gapsLo_[3 * i + 1] = 0;
\r
216 listheader.m_gapsLo_[3 * i + 2] = 0;
\r
217 listheader.m_numStr_[i] = 0;
\r
218 listheader.m_fStrToken_[i] = null;
\r
219 listheader.m_lStrToken_[i] = null;
\r
220 listheader.m_pos_[i] = -1;
\r
223 if ((listheader.m_baseCE_ >>> 24)
\r
224 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
\r
225 && (listheader.m_baseCE_ >>> 24)
\r
226 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)
\r
229 listheader.m_pos_[0] = 0;
\r
230 int t1 = listheader.m_baseCE_;
\r
231 int t2 = listheader.m_baseContCE_;
\r
232 listheader.m_gapsLo_[0] = mergeCE(t1, t2,
\r
234 listheader.m_gapsLo_[1] = mergeCE(t1, t2,
\r
235 Collator.SECONDARY);
\r
236 listheader.m_gapsLo_[2] = mergeCE(t1, t2,
\r
237 Collator.TERTIARY);
\r
238 int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_ | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
\r
239 primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(RuleBasedCollator.impCEGen_.getRawFromImplicit(primaryCE)+1);
\r
241 t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
\r
242 t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
244 // if (listheader.m_baseCE_ < 0xEF000000) {
\r
245 // // first implicits have three byte primaries, with a gap of
\r
246 // // one so we esentially need to add 2 to the top byte in
\r
247 // // listheader.m_baseContCE_
\r
248 // t2 += 0x02000000;
\r
251 // // second implicits have four byte primaries, with a gap of
\r
252 // // IMPLICIT_LAST2_MULTIPLIER_
\r
253 // // Now, this guy is not really accessible here, so until we
\r
254 // // find a better way to pass it around, assume that the gap is 1
\r
255 // t2 += 0x00020000;
\r
257 listheader.m_gapsHi_[0] = mergeCE(t1, t2,
\r
259 listheader.m_gapsHi_[1] = mergeCE(t1, t2,
\r
260 Collator.SECONDARY);
\r
261 listheader.m_gapsHi_[2] = mergeCE(t1, t2,
\r
262 Collator.TERTIARY);
\r
264 else if (listheader.m_indirect_ == true
\r
265 && listheader.m_nextCE_ != 0) {
\r
266 listheader.m_pos_[0] = 0;
\r
267 int t1 = listheader.m_baseCE_;
\r
268 int t2 = listheader.m_baseContCE_;
\r
269 listheader.m_gapsLo_[0] = mergeCE(t1, t2,
\r
271 listheader.m_gapsLo_[1] = mergeCE(t1, t2,
\r
272 Collator.SECONDARY);
\r
273 listheader.m_gapsLo_[2] = mergeCE(t1, t2,
\r
274 Collator.TERTIARY);
\r
275 t1 = listheader.m_nextCE_;
\r
276 t2 = listheader.m_nextContCE_;
\r
277 listheader.m_gapsHi_[0] = mergeCE(t1, t2,
\r
279 listheader.m_gapsHi_[1] = mergeCE(t1, t2,
\r
280 Collator.SECONDARY);
\r
281 listheader.m_gapsHi_[2] = mergeCE(t1, t2,
\r
282 Collator.TERTIARY);
\r
286 if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
\r
287 listheader.m_pos_[tokenstrength]
\r
288 = getInverseNext(listheader,
\r
290 if (listheader.m_pos_[tokenstrength] >= 0) {
\r
291 listheader.m_fStrToken_[tokenstrength] = token;
\r
294 // The CE must be implicit, since it's not in the
\r
297 throw new Exception("Internal program error");
\r
301 while (token != null && token.m_strength_ >= tokenstrength)
\r
303 if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
\r
304 listheader.m_lStrToken_[tokenstrength] = token;
\r
306 token = token.m_next_;
\r
308 if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
\r
309 // check if previous interval is the same and merge the
\r
310 // intervals if it is so
\r
311 if (listheader.m_pos_[tokenstrength]
\r
312 == listheader.m_pos_[tokenstrength + 1]) {
\r
313 listheader.m_fStrToken_[tokenstrength]
\r
314 = listheader.m_fStrToken_[tokenstrength
\r
316 listheader.m_fStrToken_[tokenstrength + 1] = null;
\r
317 listheader.m_lStrToken_[tokenstrength + 1] = null;
\r
318 listheader.m_pos_[tokenstrength + 1] = -1;
\r
321 if (token != null) {
\r
322 tokenstrength = token.m_strength_;
\r
328 for (int st = 0; st < 3; st ++) {
\r
329 int pos = listheader.m_pos_[st];
\r
331 int t1 = m_table_[3 * pos];
\r
332 int t2 = m_table_[3 * pos + 1];
\r
333 listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
\r
335 listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2,
\r
336 Collator.SECONDARY);
\r
337 listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
\r
338 | (t2 & 0x3f) << 16;
\r
340 //t1 = m_table_[3 * pos];
\r
341 //t2 = m_table_[3 * pos + 1];
\r
342 t1 = listheader.m_baseCE_;
\r
343 t2 = listheader.m_baseContCE_;
\r
345 listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
\r
347 listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2,
\r
348 Collator.SECONDARY);
\r
349 listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
\r
350 | (t2 & 0x3f) << 16;
\r
357 * Gets the next CE in the inverse table
\r
358 * @param listheader token list header
\r
359 * @param strength collation strength
\r
362 private final int getInverseNext(CollationRuleParser.TokenListHeader
\r
366 int ce = listheader.m_baseCE_;
\r
367 int secondce = listheader.m_baseContCE_;
\r
368 int result = findInverseCE(ce, secondce);
\r
374 ce &= STRENGTH_MASK_[strength];
\r
375 secondce &= STRENGTH_MASK_[strength];
\r
378 int nextcontce = secondce;
\r
380 while((nextce & STRENGTH_MASK_[strength]) == ce
\r
381 && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
\r
382 nextce = m_table_[3 * (++ result)];
\r
383 nextcontce = m_table_[3 * result + 1];
\r
386 listheader.m_nextCE_ = nextce;
\r
387 listheader.m_nextContCE_ = nextcontce;
\r
393 // package private data members ------------------------------------------
\r
396 * Inverse UCA, instantiate only when required
\r
398 static final InverseUCA INVERSE_UCA_;
\r
401 * UCA and Inverse UCA version do not match
\r
403 private static final String INV_UCA_VERSION_MISMATCH_ =
\r
404 "UCA versions of UCA and inverse UCA should match";
\r
407 * UCA and Inverse UCA version do not match
\r
409 private static final String UCA_NOT_INSTANTIATED_ =
\r
410 "UCA is not instantiated!";
\r
413 * Initializing the inverse UCA
\r
416 InverseUCA temp = null;
\r
418 temp = CollatorReader.getInverseUCA();
\r
419 } catch (IOException e) {
\r
424 String invdat = "/com/ibm/icu/impl/data/invuca.icu";
\r
425 InputStream i = CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
\r
426 BufferedInputStream b = new BufferedInputStream(i, 110000);
\r
427 INVERSE_UCA_ = CollatorReader.readInverseUCA(b);
\r
431 catch (Exception e)
\r
433 e.printStackTrace();
\r
434 throw new RuntimeException(e.getMessage());
\r
438 if(temp != null && RuleBasedCollator.UCA_ != null) {
\r
439 if(!temp.m_UCA_version_.equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
\r
440 throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
\r
443 throw new RuntimeException(UCA_NOT_INSTANTIATED_);
\r
446 INVERSE_UCA_ = temp;
\r
449 // package private methods -----------------------------------------------
\r
452 * Parse and sets the collation rules in the argument collator
\r
453 * @param collator to set
\r
454 * @exception Exception thrown when internal program error occurs
\r
456 void setRules(RuleBasedCollator collator) throws Exception
\r
458 if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) {
\r
459 // we have a set of rules, let's make something of it
\r
460 assembleTailoringTable(collator);
\r
462 else { // no rules, but no error either must be only options
\r
463 // We will init the collator from UCA
\r
464 collator.setWithUCATables();
\r
466 // And set only the options
\r
467 m_parser_.setDefaultOptionsInCollator(collator);
\r
470 private void copyRangeFromUCA(BuildTable t, int start, int end) {
\r
472 for (u = start; u <= end; u ++) {
\r
473 // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
\r
474 int CE = t.m_mapping_.getValue(u);
\r
475 if (CE == CE_NOT_FOUND_
\r
476 // this test is for contractions that are missing the starting
\r
477 // element. Looks like latin-1 should be done before
\r
478 // assembling the table, even if it results in more false
\r
479 // closure elements
\r
480 || (isContractionTableElement(CE)
\r
481 && getCE(t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
\r
482 //m_utilElement_.m_uchars_ = str.toString();
\r
483 m_utilElement_.m_uchars_ = UCharacter.toString(u);
\r
484 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
\r
485 m_utilElement_.m_prefix_ = 0;
\r
486 m_utilElement_.m_CELength_ = 0;
\r
487 m_utilElement_.m_prefixChars_ = null;
\r
488 m_utilColEIter_.setText(m_utilElement_.m_uchars_);
\r
489 while (CE != CollationElementIterator.NULLORDER) {
\r
490 CE = m_utilColEIter_.next();
\r
491 if (CE != CollationElementIterator.NULLORDER) {
\r
492 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_ ++]
\r
496 addAnElement(t, m_utilElement_);
\r
502 * 2. Eliminate the negative lists by doing the following for each
\r
503 * non-null negative list:
\r
504 * o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
\r
505 * create new ListHeader X
\r
506 * o reverse the list, add to the end of X's positive list. Reset the
\r
507 * strength of the first item you add, based on the stronger strength
\r
508 * levels of the two lists.
\r
510 * 3. For each ListHeader with a non-null positive list:
\r
511 * o Find all character strings with CEs between the baseCE and the
\r
512 * next/previous CE, at the strength of the first token. Add these to the
\r
514 * ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
\r
515 * tailoring has & x < z...
\r
516 * ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
\r
518 * It is possible that this part should be done even while constructing list
\r
519 * The problem is that it is unknown what is going to be the strongest
\r
521 * So we might as well do it here
\r
522 * o Allocate CEs for each token in the list, based on the total number N
\r
523 * of the largest level difference, and the gap G between baseCE and nextCE
\r
524 * at that level. The relation * between the last item and nextCE is the
\r
525 * same as the strongest strength.
\r
526 * o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
\r
527 * ? There are 3 primary items: a, d, e. Fit them into the primary gap.
\r
528 * Then fit b and c into the secondary gap between a and d, then fit q
\r
529 * into the tertiary gap between b and c.
\r
530 * o Example: baseCE << b <<< q << c * nextCE(X,2)
\r
531 * ? There are 2 secondary items: b, c. Fit them into the secondary gap.
\r
532 * Then fit q into the tertiary gap between b and c.
\r
533 * o When incrementing primary values, we will not cross high byte
\r
534 * boundaries except where there is only a single-byte primary. That is
\r
535 * to ensure that the script reordering will continue to work.
\r
536 * @param collator the rule based collator to update
\r
537 * @exception Exception thrown when internal program error occurs
\r
539 void assembleTailoringTable(RuleBasedCollator collator) throws Exception
\r
542 for (int i = 0; i < m_parser_.m_resultLength_; i ++) {
\r
543 // now we need to generate the CEs
\r
544 // We stuff the initial value in the buffers, and increase the
\r
545 // appropriate buffer according to strength
\r
546 if (m_parser_.m_listHeader_[i].m_first_ != null) {
\r
547 // if there are any elements
\r
548 // due to the way parser works, subsequent tailorings
\r
549 // may remove all the elements from a sequence, therefore
\r
550 // leaving an empty tailoring sequence.
\r
551 initBuffers(m_parser_.m_listHeader_[i]);
\r
555 if (m_parser_.m_variableTop_ != null) {
\r
556 // stuff the variable top value
\r
557 m_parser_.m_options_.m_variableTopValue_
\r
558 = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
\r
559 // remove it from the list
\r
560 if (m_parser_.m_variableTop_.m_listHeader_.m_first_
\r
561 == m_parser_.m_variableTop_) { // first in list
\r
562 m_parser_.m_variableTop_.m_listHeader_.m_first_
\r
563 = m_parser_.m_variableTop_.m_next_;
\r
565 if (m_parser_.m_variableTop_.m_listHeader_.m_last_
\r
566 == m_parser_.m_variableTop_) {
\r
568 m_parser_.m_variableTop_.m_listHeader_.m_last_
\r
569 = m_parser_.m_variableTop_.m_previous_;
\r
571 if (m_parser_.m_variableTop_.m_next_ != null) {
\r
572 m_parser_.m_variableTop_.m_next_.m_previous_
\r
573 = m_parser_.m_variableTop_.m_previous_;
\r
575 if (m_parser_.m_variableTop_.m_previous_ != null) {
\r
576 m_parser_.m_variableTop_.m_previous_.m_next_
\r
577 = m_parser_.m_variableTop_.m_next_;
\r
581 BuildTable t = new BuildTable(m_parser_);
\r
583 // After this, we have assigned CE values to all regular CEs now we
\r
584 // will go through list once more and resolve expansions, make
\r
585 // UCAElements structs and add them to table
\r
586 for (int i = 0; i < m_parser_.m_resultLength_; i ++) {
\r
587 // now we need to generate the CEs
\r
588 // We stuff the initial value in the buffers, and increase the
\r
589 // appropriate buffer according to strength */
\r
590 createElements(t, m_parser_.m_listHeader_[i]);
\r
593 m_utilElement_.clear();
\r
594 StringBuffer str = new StringBuffer();
\r
596 // add latin-1 stuff
\r
597 copyRangeFromUCA(t, 0, 0xFF);
\r
599 // add stuff for copying
\r
600 if(m_parser_.m_copySet_ != null) {
\r
602 for(i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
\r
603 copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i),
\r
604 m_parser_.m_copySet_.getRangeEnd(i));
\r
608 // copy contractions from the UCA - this is felt mostly for cyrillic
\r
609 char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
\r
611 while (conts[offset] != 0) {
\r
612 // tailoredCE = ucmpe32_get(t.m_mapping, *conts);
\r
613 int tailoredCE = t.m_mapping_.getValue(conts[offset]);
\r
614 Elements prefixElm = null;
\r
615 if (tailoredCE != CE_NOT_FOUND_) {
\r
616 boolean needToAdd = true;
\r
617 if (isContractionTableElement(tailoredCE)) {
\r
618 if (isTailored(t.m_contractions_, tailoredCE,
\r
619 conts, offset + 1) == true) {
\r
623 if (!needToAdd && isPrefix(tailoredCE) && conts[offset+1]==0) {
\r
624 // pre-context character in UCA
\r
625 // The format for pre-context character is
\r
626 // conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP
\r
627 Elements elm = new Elements();
\r
628 elm.m_cPoints_=m_utilElement_.m_uchars_;
\r
630 elm.m_uchars_= UCharacter.toString(conts[offset]);
\r
631 elm.m_prefixChars_=UCharacter.toString(conts[offset+2]);
\r
632 elm.m_prefix_=0; // TODO(claireho) : confirm!
\r
633 prefixElm = (Elements)t.m_prefixLookup_.get(elm);
\r
634 if ((prefixElm== null) ||
\r
635 (prefixElm.m_prefixChars_.charAt(0)!= conts[offset+2])) {
\r
639 if(m_parser_.m_removeSet_ != null && m_parser_.m_removeSet_.contains(conts[offset])) {
\r
644 if (needToAdd == true) {
\r
645 // we need to add if this contraction is not tailored.
\r
646 if (conts[offset+1]!=0) { // not precontext
\r
647 m_utilElement_.m_prefix_ = 0;
\r
648 m_utilElement_.m_prefixChars_ = null;
\r
649 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
\r
650 str.delete(0, str.length());
\r
651 str.append(conts[offset]);
\r
652 str.append(conts[offset + 1]);
\r
653 if (conts[offset + 2] != 0) {
\r
654 str.append(conts[offset + 2]);
\r
656 m_utilElement_.m_uchars_ = str.toString();
\r
657 m_utilElement_.m_CELength_ = 0;
\r
658 m_utilColEIter_.setText(m_utilElement_.m_uchars_);
\r
660 else { // add a pre-context element
\r
662 str.delete(0, str.length()); // clean up
\r
663 m_utilElement_.m_cPoints_ = UCharacter.toString(conts[offset]);
\r
664 m_utilElement_.m_CELength_ = 0;
\r
665 m_utilElement_.m_uchars_ = UCharacter.toString(conts[offset]);
\r
666 m_utilElement_.m_prefixChars_ = UCharacter.toString(conts[offset+2]);
\r
667 if (prefixElm==null) {
\r
668 m_utilElement_.m_prefix_=0;
\r
670 else { // TODO (claireho): confirm!
\r
671 m_utilElement_.m_prefix_= m_utilElement_.m_prefix_;
\r
672 // m_utilElement_.m_prefix_= prefixElm.m_prefix_;
\r
674 m_utilColEIter_.setText(m_utilElement_.m_prefixChars_);
\r
675 while (m_utilColEIter_.next()!=CollationElementIterator.NULLORDER) {
\r
676 // count number of keys for pre-context char.
\r
679 str.append(conts[offset+2]);
\r
680 str.append(conts[offset]);
\r
681 m_utilColEIter_.setText(str.toString());
\r
682 // Skip the keys for prefix character, then copy the rest to el.
\r
683 while ((preKeyLen-->0) &&
\r
684 m_utilColEIter_.next()!= CollationElementIterator.NULLORDER) {
\r
690 int CE = m_utilColEIter_.next();
\r
691 if (CE != CollationElementIterator.NULLORDER) {
\r
692 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
\r
698 addAnElement(t, m_utilElement_);
\r
700 } else if(m_parser_.m_removeSet_ != null && m_parser_.m_removeSet_.contains(conts[offset])) {
\r
701 copyRangeFromUCA(t, conts[offset], conts[offset]);
\r
707 // Add completely ignorable elements
\r
708 processUCACompleteIgnorables(t);
\r
710 // canonical closure
\r
711 canonicalClosure(t);
\r
713 // still need to produce compatibility closure
\r
714 assembleTable(t, collator);
\r
717 // private inner classes -------------------------------------------------
\r
719 private static class CEGenerator
\r
721 // package private data members --------------------------------------
\r
723 WeightRange m_ranges_[];
\r
724 int m_rangesLength_;
\r
731 int m_fLow_; // forbidden Low
\r
732 int m_fHigh_; // forbidden High
\r
734 // package private constructor ---------------------------------------
\r
738 m_ranges_ = new WeightRange[7];
\r
739 for (int i = 6; i >= 0; i --) {
\r
740 m_ranges_[i] = new WeightRange();
\r
745 private static class WeightRange implements Comparable
\r
747 // public methods ----------------------------------------------------
\r
750 * Compares this object with target
\r
751 * @param target object to compare with
\r
752 * @return 0 if equals, 1 if this is > target, -1 otherwise
\r
754 public int compareTo(Object target)
\r
756 if (this == target) {
\r
759 int tstart = ((WeightRange)target).m_start_;
\r
760 if (m_start_ == tstart) {
\r
763 if (m_start_ > tstart) {
\r
772 public void clear()
\r
782 // package private data members --------------------------------------
\r
791 // package private constructor ---------------------------------------
\r
799 * Copy constructor.
\r
800 * Cloneable is troublesome, needs to check for exception
\r
801 * @param source to clone
\r
803 WeightRange(WeightRange source)
\r
805 m_start_ = source.m_start_;
\r
806 m_end_ = source.m_end_;
\r
807 m_length_ = source.m_length_;
\r
808 m_count_ = source.m_count_;
\r
809 m_length2_ = source.m_length2_;
\r
810 m_count2_ = source.m_count2_;
\r
814 private static class MaxJamoExpansionTable
\r
816 // package private data members --------------------------------------
\r
818 Vector m_endExpansionCE_;
\r
819 // vector of booleans
\r
825 // package private constructor ---------------------------------------
\r
827 MaxJamoExpansionTable()
\r
829 m_endExpansionCE_ = new Vector();
\r
830 m_isV_ = new Vector();
\r
831 m_endExpansionCE_.add(new Integer(0));
\r
832 m_isV_.add(Boolean.FALSE);
\r
838 MaxJamoExpansionTable(MaxJamoExpansionTable table)
\r
840 m_endExpansionCE_ = (Vector)table.m_endExpansionCE_.clone();
\r
841 m_isV_ = (Vector)table.m_isV_.clone();
\r
842 m_maxLSize_ = table.m_maxLSize_;
\r
843 m_maxVSize_ = table.m_maxVSize_;
\r
844 m_maxTSize_ = table.m_maxTSize_;
\r
848 private static class MaxExpansionTable
\r
850 // package private constructor --------------------------------------
\r
852 MaxExpansionTable()
\r
854 m_endExpansionCE_ = new Vector();
\r
855 m_expansionCESize_ = new Vector();
\r
856 m_endExpansionCE_.add(new Integer(0));
\r
857 m_expansionCESize_.add(new Byte((byte)0));
\r
860 MaxExpansionTable(MaxExpansionTable table)
\r
862 m_endExpansionCE_ = (Vector)table.m_endExpansionCE_.clone();
\r
863 m_expansionCESize_ = (Vector)table.m_expansionCESize_.clone();
\r
866 // package private data member --------------------------------------
\r
868 Vector m_endExpansionCE_;
\r
869 Vector m_expansionCESize_;
\r
872 private static class BasicContractionTable
\r
874 // package private constructors -------------------------------------
\r
876 BasicContractionTable()
\r
878 m_CEs_ = new Vector();
\r
879 m_codePoints_ = new StringBuffer();
\r
882 // package private data members -------------------------------------
\r
884 StringBuffer m_codePoints_;
\r
888 private static class ContractionTable
\r
890 // package private constructor --------------------------------------
\r
893 * Builds a contraction table
\r
896 ContractionTable(IntTrieBuilder mapping)
\r
898 m_mapping_ = mapping;
\r
899 m_elements_ = new Vector();
\r
900 m_CEs_ = new Vector();
\r
901 m_codePoints_ = new StringBuffer();
\r
902 m_offsets_ = new Vector();
\r
903 m_currentTag_ = CE_NOT_FOUND_TAG_;
\r
907 * Copies a contraction table.
\r
908 * Not all data will be copied into their own object.
\r
911 ContractionTable(ContractionTable table)
\r
913 m_mapping_ = table.m_mapping_;
\r
914 m_elements_ = (Vector)table.m_elements_.clone();
\r
915 m_codePoints_ = new StringBuffer(table.m_codePoints_.toString());
\r
916 m_CEs_ = (Vector)table.m_CEs_.clone();
\r
917 m_offsets_ = (Vector)table.m_offsets_.clone();
\r
918 m_currentTag_ = table.m_currentTag_;
\r
921 // package private data members ------------------------------------
\r
924 * Vector of BasicContractionTable
\r
926 Vector m_elements_;
\r
927 IntTrieBuilder m_mapping_;
\r
928 StringBuffer m_codePoints_;
\r
935 * Private class for combining mark table.
\r
936 * The table is indexed by the class value(0-255).
\r
938 private static class CombinClassTable {
\r
940 * accumulated numbers of combining marks.
\r
942 int[] index = new int[256];
\r
945 * code point array for combining marks.
\r
955 CombinClassTable() {
\r
963 * Copy the combining mark table from ccc and index in compact
\r
966 * @param cps : code point array
\r
967 * @param size : size of ccc
\r
968 * @param index : index of combining classes(0-255)
\r
970 void generate(char[] cps, int numOfCM, int[] ccIndex) {
\r
973 cPoints = new char[numOfCM];
\r
974 for (int i=0; i<256; i++) {
\r
975 for (int j=0; j<ccIndex[i]; j++) {
\r
976 cPoints[count++] = cps[(i<<8)+j];
\r
984 * Get first CM(combining mark) with the combining class value cClass.
\r
985 * @param cClass : combining class value.
\r
986 * @return combining mark codepoint or 0 if no combining make with class value cClass
\r
988 char GetFirstCM(int cClass) {
\r
990 if (cPoints==null || cClass==0 || index[cClass]==index[cClass-1]) {
\r
994 return cPoints[index[cClass-1]];
\r
998 * Get next CM(combining mark) with the combining class value cClass.
\r
999 * Return combining mark codepoint or 0 if no next CM.
\r
1001 char GetNextCM() {
\r
1002 if (cPoints==null || index[curClass]==(index[curClass-1]+pos)) {
\r
1005 return cPoints[index[curClass-1] + (pos++)];
\r
1008 // private data members
\r
1013 private static final class BuildTable implements TrieBuilder.DataManipulate
\r
1015 // package private methods ------------------------------------------
\r
1018 * For construction of the Trie tables.
\r
1019 * Has to be labeled public
\r
1022 * @return data offset or 0
\r
1024 public int getFoldedValue(int cp, int offset)
\r
1026 int limit = cp + 0x400;
\r
1027 while (cp < limit) {
\r
1028 int value = m_mapping_.getValue(cp);
\r
1029 boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
\r
1030 int tag = getCETag(value);
\r
1031 if (inBlockZero == true) {
\r
1032 cp += TrieBuilder.DATA_BLOCK_LENGTH;
\r
1034 else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_
\r
1035 || tag == CE_NOT_FOUND_TAG_))) {
\r
1036 // These are values that are starting in either UCA
\r
1037 // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
\r
1038 // Presence of these tags means that there is nothing in
\r
1039 // this position and that it should be skipped.
\r
1040 return RuleBasedCollator.CE_SPECIAL_FLAG_
\r
1041 | (CE_SURROGATE_TAG_ << 24) | offset;
\r
1050 // package private constructor --------------------------------------
\r
1055 BuildTable(CollationRuleParser parser)
\r
1057 m_collator_ = new RuleBasedCollator();
\r
1058 m_collator_.setWithUCAData();
\r
1059 MaxExpansionTable maxet = new MaxExpansionTable();
\r
1060 MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
\r
1061 m_options_ = parser.m_options_;
\r
1062 m_expansions_ = new Vector();
\r
1063 // Do your own mallocs for the structure, array and have linear
\r
1065 int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
\r
1066 | (CE_NOT_FOUND_TAG_ << 24);
\r
1067 // temporary fix for jb3822, 0x100000 -> 30000
\r
1068 m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue,
\r
1069 trieinitialvalue, true);
\r
1070 m_prefixLookup_ = new Hashtable();
\r
1071 // uhash_open(prefixLookupHash, prefixLookupComp);
\r
1072 m_contractions_ = new ContractionTable(m_mapping_);
\r
1073 // copy UCA's maxexpansion and merge as we go along
\r
1074 m_maxExpansions_ = maxet;
\r
1075 // adding an extra initial value for easier manipulation
\r
1077 i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i ++) {
\r
1078 maxet.m_endExpansionCE_.add(new Integer(
\r
1079 RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
\r
1080 maxet.m_expansionCESize_.add(new Byte(
\r
1081 RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
\r
1083 m_maxJamoExpansions_ = maxjet;
\r
1085 m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
\r
1086 m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
\r
1087 Arrays.fill(m_unsafeCP_, (byte)0);
\r
1088 Arrays.fill(m_contrEndCP_, (byte)0);
\r
1092 * Duplicating a BuildTable.
\r
1093 * Not all data will be duplicated into their own object.
\r
1094 * @param table to clone
\r
1096 BuildTable(BuildTable table)
\r
1098 m_collator_ = table.m_collator_;
\r
1099 m_mapping_ = new IntTrieBuilder(table.m_mapping_);
\r
1100 m_expansions_ = (Vector)table.m_expansions_.clone();
\r
1101 m_contractions_ = new ContractionTable(table.m_contractions_);
\r
1102 m_contractions_.m_mapping_ = m_mapping_;
\r
1103 m_options_ = table.m_options_;
\r
1104 m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_);
\r
1105 m_maxJamoExpansions_
\r
1106 = new MaxJamoExpansionTable(table.m_maxJamoExpansions_);
\r
1107 m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
\r
1108 System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
\r
1109 m_unsafeCP_.length);
\r
1110 m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
\r
1111 System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
\r
1112 m_contrEndCP_.length);
\r
1115 // package private data members -------------------------------------
\r
1117 RuleBasedCollator m_collator_;
\r
1118 IntTrieBuilder m_mapping_;
\r
1119 Vector m_expansions_;
\r
1120 ContractionTable m_contractions_;
\r
1121 // UCATableHeader image;
\r
1122 CollationRuleParser.OptionSet m_options_;
\r
1123 MaxExpansionTable m_maxExpansions_;
\r
1124 MaxJamoExpansionTable m_maxJamoExpansions_;
\r
1125 byte m_unsafeCP_[];
\r
1126 byte m_contrEndCP_[];
\r
1127 Hashtable m_prefixLookup_;
\r
1128 CombinClassTable cmLookup = null;
\r
1131 private static class Elements
\r
1133 // package private data members -------------------------------------
\r
1135 String m_prefixChars_;
\r
1141 String m_cPoints_;
\r
1143 * Offset to the working string
\r
1145 int m_cPointsOffset_;
\r
1147 * These are collation elements - there could be more than one - in
\r
1148 * case of expansion
\r
1153 * This is the value element maps in original table
\r
1156 int m_sizePrim_[];
\r
1159 boolean m_variableTop_;
\r
1160 boolean m_caseBit_;
\r
1162 // package private constructors -------------------------------------
\r
1165 * Package private constructor
\r
1169 m_sizePrim_ = new int[128];
\r
1170 m_sizeSec_ = new int[128];
\r
1171 m_sizeTer_ = new int[128];
\r
1172 m_CEs_ = new int[256];
\r
1177 * Package private constructor
\r
1179 Elements(Elements element)
\r
1181 m_prefixChars_ = element.m_prefixChars_;
\r
1182 m_prefix_ = element.m_prefix_;
\r
1183 m_uchars_ = element.m_uchars_;
\r
1184 m_cPoints_ = element.m_cPoints_;
\r
1185 m_cPointsOffset_ = element.m_cPointsOffset_;
\r
1186 m_CEs_ = element.m_CEs_;
\r
1187 m_CELength_ = element.m_CELength_;
\r
1188 m_mapCE_ = element.m_mapCE_;
\r
1189 m_sizePrim_ = element.m_sizePrim_;
\r
1190 m_sizeSec_ = element.m_sizeSec_;
\r
1191 m_sizeTer_ = element.m_sizeTer_;
\r
1192 m_variableTop_ = element.m_variableTop_;
\r
1193 m_caseBit_ = element.m_caseBit_;
\r
1196 // package private methods -------------------------------------------
\r
1199 * Initializing the elements
\r
1201 public void clear()
\r
1203 m_prefixChars_ = null;
\r
1206 m_cPoints_ = null;
\r
1207 m_cPointsOffset_ = 0;
\r
1210 Arrays.fill(m_sizePrim_, 0);
\r
1211 Arrays.fill(m_sizeSec_, 0);
\r
1212 Arrays.fill(m_sizeTer_, 0);
\r
1213 m_variableTop_ = false;
\r
1214 m_caseBit_ = false;
\r
1219 * Hashcode calculation for token
\r
1220 * @return the hashcode
\r
1222 public int hashCode()
\r
1224 String str = m_cPoints_.substring(m_cPointsOffset_);
\r
1225 return str.hashCode();
\r
1229 * Equals calculation
\r
1230 * @param target object to compare
\r
1231 * @return true if target is the same as this object
\r
1233 public boolean equals(Object target)
\r
1235 if (target == this) {
\r
1238 if (target instanceof Elements) {
\r
1239 Elements t = (Elements)target;
\r
1240 int size = m_cPoints_.length() - m_cPointsOffset_;
\r
1241 if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
\r
1242 return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,
\r
1244 m_cPointsOffset_, size);
\r
// private data member ---------------------------------------------------

/**
 * Maximum strength used in CE building.
 */
private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
/**
 * Maximum collation strength.
 */
private static final int CE_STRENGTH_LIMIT_ = 16;
/**
 * Strength mask array, used in inverse UCA; indexed by strength
 * (primary, secondary, tertiary).
 */
private static final int STRENGTH_MASK_[] = {0xFFFF0000, 0xFFFFFF00,
                                             0xFFFFFFFF};
/**
 * CE tag for not found.
 */
private static final int CE_NOT_FOUND_ = 0xF0000000;
/**
 * CE tag for not found.
 */
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
 * This code point results in an expansion.
 */
private static final int CE_EXPANSION_TAG_ = 1;
/**
 * Start of a contraction.
 */
private static final int CE_CONTRACTION_TAG_ = 2;
/**
 * Thai character - do the reordering.
 */
//private static final int CE_THAI_TAG_ = 3;
/**
 * Charset processing, not yet implemented.
 */
//private static final int CE_CHARSET_TAG_ = 4;
/**
 * Lead surrogate that is tailored and doesn't start a contraction.
 */
private static final int CE_SURROGATE_TAG_ = 5;
//private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
//private static final int CE_LEAD_SURROGATE_TAG_ = 7;
//private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/**
 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D.
 */
//private static final int CE_CJK_IMPLICIT_TAG_ = 9;
private static final int CE_IMPLICIT_TAG_ = 10;
private static final int CE_SPEC_PROC_TAG_ = 11;
/**
 * This is a three byte primary with starting secondaries and tertiaries.
 * It fits in a single 32 bit CE and is used instead of expansion to save
 * space without affecting the performance (hopefully).
 */
private static final int CE_LONG_PRIMARY_TAG_ = 12;
/**
 * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each
 * latin 1 char + some power of two for hashing the rest of the chars.
 */
private static final int UNSAFECP_TABLE_SIZE_ = 1056;
/**
 * Mask value down to "some power of two" -1. Number of bits, not number
 * of bytes.
 */
private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
/**
 * Case values stored in the case bits of a CE.
 */
private static final int UPPER_CASE_ = 0x80;
private static final int MIXED_CASE_ = 0x40;
private static final int LOWER_CASE_ = 0x00;
/**
 * Initial table size.
 */
//private static final int INIT_TABLE_SIZE_ = 1028;
/**
 * Header size, copied from ICU4C, to be changed when that value changes.
 */
//private static final int HEADER_SIZE_ = 0xC4;
/**
 * Contraction table new element indicator.
 */
private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
\r
1348 * Parser for the rules
\r
1350 private CollationRuleParser m_parser_;
\r
1352 * Utility UCA collation element iterator
\r
1354 private CollationElementIterator m_utilColEIter_;
\r
1356 * Utility data members
\r
1358 private CEGenerator m_utilGens_[] = {new CEGenerator(), new CEGenerator(),
\r
1359 new CEGenerator()};
\r
1360 private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
\r
1361 private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
\r
1362 private Elements m_utilElement_ = new Elements();
\r
1363 private Elements m_utilElement2_ = new Elements();
\r
1364 private CollationRuleParser.Token m_utilToken_
\r
1365 = new CollationRuleParser.Token();
\r
1366 private int m_utilCountBuffer_[] = new int[6];
\r
1367 private long m_utilLongBuffer_[] = new long[5];
\r
1368 private WeightRange m_utilLowerWeightRange_[] =
\r
1369 {new WeightRange(), new WeightRange(),
\r
1370 new WeightRange(), new WeightRange(),
\r
1371 new WeightRange()};
\r
1372 private WeightRange m_utilUpperWeightRange_[] =
\r
1373 {new WeightRange(), new WeightRange(),
\r
1374 new WeightRange(), new WeightRange(),
\r
1375 new WeightRange()};
\r
1376 private WeightRange m_utilWeightRange_ = new WeightRange();
\r
1377 private char m_utilCharBuffer_[] = new char[256];
\r
1378 private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
\r
1379 private StringBuffer m_utilStringBuffer_ = new StringBuffer("");
\r
1380 // Flag indicating a combining marks table is required or not.
\r
1381 private static boolean buildCMTabFlag = false;
\r
1384 // private methods -------------------------------------------------------
\r
1387 * @param listheader parsed rule tokens
\r
1388 * @exception Exception thrown when internal error occurs
\r
1390 private void initBuffers(CollationRuleParser.TokenListHeader listheader)
\r
1393 CollationRuleParser.Token token = listheader.m_last_;
\r
1394 Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);
\r
1396 token.m_toInsert_ = 1;
\r
1397 m_utilIntBuffer_[token.m_strength_] = 1;
\r
1398 while (token.m_previous_ != null) {
\r
1399 if (token.m_previous_.m_strength_ < token.m_strength_) {
\r
1401 m_utilIntBuffer_[token.m_strength_] = 0;
\r
1402 m_utilIntBuffer_[token.m_previous_.m_strength_] ++;
\r
1404 else if (token.m_previous_.m_strength_ > token.m_strength_) {
\r
1406 m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
\r
1409 m_utilIntBuffer_[token.m_strength_] ++;
\r
1411 token = token.m_previous_;
\r
1412 token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
\r
1415 token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
\r
1416 INVERSE_UCA_.getInverseGapPositions(listheader);
\r
1418 token = listheader.m_first_;
\r
1419 int fstrength = Collator.IDENTICAL;
\r
1420 int initstrength = Collator.IDENTICAL;
\r
1422 m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,
\r
1423 listheader.m_baseContCE_,
\r
1424 Collator.PRIMARY);
\r
1425 m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,
\r
1426 listheader.m_baseContCE_,
\r
1427 Collator.SECONDARY);
\r
1428 m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,
\r
1429 listheader.m_baseContCE_,
\r
1430 Collator.TERTIARY);
\r
1431 while (token != null) {
\r
1432 fstrength = token.m_strength_;
\r
1433 if (fstrength < initstrength) {
\r
1434 initstrength = fstrength;
\r
1435 if (listheader.m_pos_[fstrength] == -1) {
\r
1436 while (listheader.m_pos_[fstrength] == -1 && fstrength > 0)
\r
1440 if (listheader.m_pos_[fstrength] == -1) {
\r
1441 throw new Exception("Internal program error");
\r
1444 if (initstrength == Collator.TERTIARY) {
\r
1445 // starting with tertiary
\r
1446 m_utilCEBuffer_[Collator.PRIMARY]
\r
1447 = listheader.m_gapsLo_[fstrength * 3];
\r
1448 m_utilCEBuffer_[Collator.SECONDARY]
\r
1449 = listheader.m_gapsLo_[fstrength * 3 + 1];
\r
1450 m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
\r
1451 m_utilGens_[Collator.TERTIARY],
\r
1452 listheader.m_gapsLo_,
\r
1453 listheader.m_gapsHi_,
\r
1454 token, fstrength);
\r
1456 else if (initstrength == Collator.SECONDARY) {
\r
1458 m_utilCEBuffer_[Collator.PRIMARY]
\r
1459 = listheader.m_gapsLo_[fstrength * 3];
\r
1460 m_utilCEBuffer_[Collator.SECONDARY]
\r
1462 m_utilGens_[Collator.SECONDARY],
\r
1463 listheader.m_gapsLo_,
\r
1464 listheader.m_gapsHi_,
\r
1465 token, fstrength);
\r
1466 m_utilCEBuffer_[Collator.TERTIARY]
\r
1467 = getSimpleCEGenerator(
\r
1468 m_utilGens_[Collator.TERTIARY],
\r
1469 token, Collator.TERTIARY);
\r
1473 m_utilCEBuffer_[Collator.PRIMARY]
\r
1475 m_utilGens_[Collator.PRIMARY],
\r
1476 listheader.m_gapsLo_,
\r
1477 listheader.m_gapsHi_,
\r
1478 token, fstrength);
\r
1479 m_utilCEBuffer_[Collator.SECONDARY]
\r
1480 = getSimpleCEGenerator(
\r
1481 m_utilGens_[Collator.SECONDARY],
\r
1482 token, Collator.SECONDARY);
\r
1483 m_utilCEBuffer_[Collator.TERTIARY]
\r
1484 = getSimpleCEGenerator(
\r
1485 m_utilGens_[Collator.TERTIARY],
\r
1486 token, Collator.TERTIARY);
\r
1490 if (token.m_strength_ == Collator.TERTIARY) {
\r
1491 m_utilCEBuffer_[Collator.TERTIARY]
\r
1492 = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
\r
1494 else if (token.m_strength_ == Collator.SECONDARY) {
\r
1495 m_utilCEBuffer_[Collator.SECONDARY]
\r
1496 = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
\r
1497 m_utilCEBuffer_[Collator.TERTIARY]
\r
1498 = getSimpleCEGenerator(
\r
1499 m_utilGens_[Collator.TERTIARY],
\r
1500 token, Collator.TERTIARY);
\r
1502 else if (token.m_strength_ == Collator.PRIMARY) {
\r
1503 m_utilCEBuffer_[Collator.PRIMARY]
\r
1504 = getNextGenerated(
\r
1505 m_utilGens_[Collator.PRIMARY]);
\r
1506 m_utilCEBuffer_[Collator.SECONDARY]
\r
1507 = getSimpleCEGenerator(
\r
1508 m_utilGens_[Collator.SECONDARY],
\r
1509 token, Collator.SECONDARY);
\r
1510 m_utilCEBuffer_[Collator.TERTIARY]
\r
1511 = getSimpleCEGenerator(
\r
1512 m_utilGens_[Collator.TERTIARY],
\r
1513 token, Collator.TERTIARY);
\r
1516 doCE(m_utilCEBuffer_, token);
\r
1517 token = token.m_next_;
\r
1522 * Get the next generated ce
\r
1523 * @param g ce generator
\r
1524 * @return next generated ce
\r
1526 private int getNextGenerated(CEGenerator g)
\r
1528 g.m_current_ = nextWeight(g);
\r
1529 return g.m_current_;
\r
1533 * @param g CEGenerator
\r
1534 * @param token rule token
\r
1535 * @param strength
\r
1536 * @return ce generator
\r
1537 * @exception Exception thrown when internal error occurs
\r
1539 private int getSimpleCEGenerator(CEGenerator g,
\r
1540 CollationRuleParser.Token token,
\r
1541 int strength) throws Exception
\r
1543 int high, low, count = 1;
\r
1544 int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
\r
1546 if (strength == Collator.SECONDARY) {
\r
1547 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
\r
1548 high = 0xFFFFFFFF;
\r
1549 count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
\r
1552 low = RuleBasedCollator.BYTE_COMMON_ << 24; //0x05000000;
\r
1553 high = 0x40000000;
\r
1554 count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
\r
1557 if (token.m_next_ != null && token.m_next_.m_strength_ == strength) {
\r
1558 count = token.m_next_.m_toInsert_;
\r
1561 g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
\r
1563 g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
\r
1565 if (g.m_rangesLength_ == 0) {
\r
1566 throw new Exception("Internal program error");
\r
1568 return g.m_current_;
\r
1572 * Combines 2 ce into one with respect to the argument strength
\r
1573 * @param ce1 first ce
\r
1574 * @param ce2 second ce
\r
1575 * @param strength strength to use
\r
1576 * @return combined ce
\r
1578 private static int mergeCE(int ce1, int ce2, int strength)
\r
1580 int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
\r
1581 if (strength == Collator.SECONDARY) {
\r
1582 mask = RuleBasedCollator.CE_SECONDARY_MASK_;
\r
1584 else if (strength == Collator.PRIMARY) {
\r
1585 mask = RuleBasedCollator.CE_PRIMARY_MASK_;
\r
1589 switch (strength)
\r
1591 case Collator.PRIMARY:
\r
1592 return ce1 | ce2 >>> 16;
\r
1593 case Collator.SECONDARY:
\r
1594 return ce1 << 16 | ce2 << 8;
\r
1596 return ce1 << 24 | ce2 << 16;
\r
1601 * @param g CEGenerator
\r
1602 * @param lows low gap array
\r
1603 * @param highs high gap array
\r
1604 * @param token rule token
\r
1605 * @param fstrength
\r
1606 * @exception Exception thrown when internal error occurs
\r
1608 private int getCEGenerator(CEGenerator g, int lows[], int highs[],
\r
1609 CollationRuleParser.Token token, int fstrength)
\r
1612 int strength = token.m_strength_;
\r
1613 int low = lows[fstrength * 3 + strength];
\r
1614 int high = highs[fstrength * 3 + strength];
\r
1616 if(strength == Collator.TERTIARY) {
\r
1618 } else if(strength == Collator.PRIMARY) {
\r
1624 int count = token.m_toInsert_;
\r
1626 if (Utility.compareUnsigned(low, high) >= 0
\r
1627 && strength > Collator.PRIMARY) {
\r
1631 if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
\r
1632 if (strength == Collator.SECONDARY) {
\r
1633 if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) {
\r
1634 // Override if low range is less than UCOL_COMMON_TOP2.
\r
1635 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
\r
1637 high = 0xFFFFFFFF;
\r
1640 if ( low < RuleBasedCollator.COMMON_BOTTOM_3<<24 ) {
\r
1641 // Override if low range is less than UCOL_COMMON_BOT3.
\r
1642 low = RuleBasedCollator.COMMON_BOTTOM_3 <<24;
\r
1644 high = 0x40000000;
\r
1649 throw new Exception("Internal program error");
\r
1656 if (strength == Collator.SECONDARY) { // similar as simple
\r
1657 if (Utility.compareUnsigned(low,
\r
1658 RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
\r
1659 && Utility.compareUnsigned(low,
\r
1660 RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
\r
1661 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
\r
1663 if (Utility.compareUnsigned(high,
\r
1664 RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
\r
1665 && Utility.compareUnsigned(high,
\r
1666 RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
\r
1667 high = RuleBasedCollator.COMMON_TOP_2_ << 24;
\r
1669 if (Utility.compareUnsigned(low,
\r
1670 RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
\r
1671 g.m_rangesLength_ = allocateWeights(
\r
1672 RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24,
\r
1673 high, count, maxbyte, g.m_ranges_);
\r
1674 g.m_current_ = nextWeight(g);
\r
1675 //g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
\r
1676 return g.m_current_;
\r
1680 g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
\r
1682 if (g.m_rangesLength_ == 0) {
\r
1683 throw new Exception("Internal program error");
\r
1685 g.m_current_ = nextWeight(g);
\r
1686 return g.m_current_;
\r
1690 * @param ceparts list of collation elements parts
\r
1691 * @param token rule token
\r
1692 * @exception Exception thrown when forming case bits for expansions fails
\r
1694 private void doCE(int ceparts[], CollationRuleParser.Token token)
\r
1697 // this one makes the table and stuff
\r
1698 // int noofbytes[] = new int[3];
\r
1699 for (int i = 0; i < 3; i ++) {
\r
1700 // noofbytes[i] = countBytes(ceparts[i]);
\r
1701 m_utilIntBuffer_[i] = countBytes(ceparts[i]);
\r
1704 // Here we have to pack CEs from parts
\r
1708 while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1]
\r
1709 || cei < m_utilIntBuffer_[2]) {
\r
1711 value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
1716 if ((cei << 1) < m_utilIntBuffer_[0]) {
\r
1717 value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF)
\r
1720 if (cei < m_utilIntBuffer_[1]) {
\r
1721 value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
\r
1724 if (cei < m_utilIntBuffer_[2]) {
\r
1725 value |= ((ceparts[2] >> (32 - ((cei+1) << 3))) & 0x3F);
\r
1727 token.m_CE_[cei] = value;
\r
1730 if (cei == 0) { // totally ignorable
\r
1731 token.m_CELength_ = 1;
\r
1732 token.m_CE_[0] = 0;
\r
1734 else { // there is at least something
\r
1735 token.m_CELength_ = cei;
\r
1738 // Case bits handling for expansion
\r
1739 if(token.m_CE_[0] != 0) { // case bits should be set only for non-ignorables
\r
1740 int startoftokenrule = token.m_source_ & 0xFF;
\r
1741 if ((token.m_source_ >>> 24) > 1) {
\r
1743 int length = token.m_source_ >>> 24;
\r
1744 String tokenstr = token.m_rules_.substring(startoftokenrule,
\r
1745 startoftokenrule + length);
\r
1746 token.m_CE_[0] |= getCaseBits(tokenstr);
\r
1749 // Copy it from the UCA
\r
1751 = getFirstCE(token.m_rules_.charAt(startoftokenrule));
\r
1752 token.m_CE_[0] |= (caseCE & 0xC0);
\r
1758 * Count the number of non-zero bytes used in the ce
\r
1760 * @return number of non-zero bytes used in ce
\r
1762 private static final int countBytes(int ce)
\r
1764 int mask = 0xFFFFFFFF;
\r
1766 while (mask != 0) {
\r
1767 if ((ce & mask) != 0) {
\r
1776 * We are ready to create collation elements
\r
1777 * @param t build table to insert
\r
1778 * @param lh rule token list header
\r
1780 private void createElements(BuildTable t,
\r
1781 CollationRuleParser.TokenListHeader lh)
\r
1783 CollationRuleParser.Token tok = lh.m_first_;
\r
1784 m_utilElement_.clear();
\r
1785 while (tok != null) {
\r
1786 // first, check if there are any expansions
\r
1787 // if there are expansions, we need to do a little bit more
\r
1788 // processing since parts of expansion can be tailored, while
\r
1790 if (tok.m_expansion_ != 0) {
\r
1791 int len = tok.m_expansion_ >>> 24;
\r
1792 int currentSequenceLen = len;
\r
1793 int expOffset = tok.m_expansion_ & 0x00FFFFFF;
\r
1794 m_utilToken_.m_source_ = currentSequenceLen | expOffset;
\r
1795 m_utilToken_.m_rules_ = m_parser_.m_source_;
\r
1798 currentSequenceLen = len;
\r
1799 while (currentSequenceLen > 0) {
\r
1800 m_utilToken_.m_source_ = (currentSequenceLen << 24)
\r
1802 CollationRuleParser.Token expt =
\r
1803 (CollationRuleParser.Token)
\r
1804 m_parser_.m_hashTable_.get(m_utilToken_);
\r
1806 && expt.m_strength_
\r
1807 != CollationRuleParser.TOKEN_RESET_) {
\r
1808 // expansion is tailored
\r
1809 int noOfCEsToCopy = expt.m_CELength_;
\r
1810 for (int j = 0; j < noOfCEsToCopy; j ++) {
\r
1811 tok.m_expCE_[tok.m_expCELength_ + j]
\r
1814 tok.m_expCELength_ += noOfCEsToCopy;
\r
1815 // never try to add codepoints and CEs.
\r
1816 // For some odd reason, it won't work.
\r
1817 expOffset += currentSequenceLen; //noOfCEsToCopy;
\r
1818 len -= currentSequenceLen; //noOfCEsToCopy;
\r
1822 currentSequenceLen --;
\r
1825 if (currentSequenceLen == 0) {
\r
1826 // couldn't find any tailored subsequence, will have to
\r
1827 // get one from UCA. first, get the UChars from the
\r
1828 // rules then pick CEs out until there is no more and
\r
1829 // stuff them into expansion
\r
1830 m_utilColEIter_.setText(m_parser_.m_source_.substring(
\r
1831 expOffset, expOffset + 1));
\r
1833 int order = m_utilColEIter_.next();
\r
1834 if (order == CollationElementIterator.NULLORDER) {
\r
1837 tok.m_expCE_[tok.m_expCELength_ ++] = order;
\r
1845 tok.m_expCELength_ = 0;
\r
1848 // set the ucaelement with obtained values
\r
1849 m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_;
\r
1852 System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
\r
1854 System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
\r
1855 tok.m_CELength_, tok.m_expCELength_);
\r
1858 // We kept prefix and source kind of together, as it is a kind of a
\r
1860 // However, now we have to slice the prefix off the main thing -
\r
1861 m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
\r
1862 m_utilElement_.m_cPointsOffset_ = 0; //el.m_uchars_;
\r
1863 if (tok.m_prefix_ != 0) {
\r
1864 // we will just copy the prefix here, and adjust accordingly in
\r
1865 // the addPrefix function in ucol_elm. The reason is that we
\r
1866 // need to add both composed AND decomposed elements to the
\r
1868 int size = tok.m_prefix_ >> 24;
\r
1869 int offset = tok.m_prefix_ & 0x00FFFFFF;
\r
1870 m_utilElement_.m_prefixChars_
\r
1871 = m_parser_.m_source_.substring(offset, offset + size);
\r
1872 size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
\r
1873 offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);
\r
1874 m_utilElement_.m_uchars_
\r
1875 = m_parser_.m_source_.substring(offset, offset + size);
\r
1878 m_utilElement_.m_prefixChars_ = null;
\r
1879 int offset = tok.m_source_ & 0x00FFFFFF;
\r
1880 int size = tok.m_source_ >>> 24;
\r
1881 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(offset,
\r
1884 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
\r
1886 boolean containCombinMarks = false;
\r
1887 for (int i = 0; i < m_utilElement_.m_cPoints_.length()
\r
1888 - m_utilElement_.m_cPointsOffset_; i ++) {
\r
1889 if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
\r
1890 t.m_collator_.m_isJamoSpecial_ = true;
\r
1893 if ( !buildCMTabFlag ) {
\r
1894 // check combining class
\r
1895 char fcd = NormalizerImpl.getFCD16(m_utilElement_.m_cPoints_.charAt(i));
\r
1896 if ( (fcd & 0xff) == 0 ) {
\r
1897 // reset flag when current char is not combining mark.
\r
1898 containCombinMarks = false;
\r
1901 containCombinMarks = true;
\r
1906 if ( !buildCMTabFlag && containCombinMarks ) {
\r
1907 buildCMTabFlag = true;
\r
1912 // Case bits handling
\r
1913 m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F;
\r
1914 // Clean the case bits field
\r
1915 if (m_utilElement_.m_cPoints_.length()
\r
1916 - m_utilElement_.m_cPointsOffset_ > 1) {
\r
1918 m_utilElement_.m_CEs_[0]
\r
1919 |= getCaseBits(m_utilElement_.m_cPoints_);
\r
1922 // Copy it from the UCA
\r
1923 int caseCE = getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
\r
1924 m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0);
\r
1928 // and then, add it
\r
1929 addAnElement(t, m_utilElement_);
\r
1930 tok = tok.m_next_;
\r
1935 * Testing if the string argument has case
\r
1936 * @param src string
\r
1937 * @return the case for this char array
\r
1938 * @exception Exception thrown when internal program error occurs
\r
1940 private final int getCaseBits(String src) throws Exception
\r
1944 src = Normalizer.decompose(src, true);
\r
1945 m_utilColEIter_.setText(src);
\r
1946 for (int i = 0; i < src.length(); i++) {
\r
1947 m_utilColEIter_.setText(src.substring(i, i + 1));
\r
1948 int order = m_utilColEIter_.next();
\r
1949 if (RuleBasedCollator.isContinuation(order)) {
\r
1950 throw new Exception("Internal program error");
\r
1952 if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_)
\r
1957 char ch = src.charAt(i);
\r
1958 if (UCharacter.isLowerCase(ch)) {
\r
1962 if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
\r
1969 if (uCount != 0 && lCount != 0) {
\r
1970 return MIXED_CASE_;
\r
1972 else if (uCount != 0) {
\r
1973 return UPPER_CASE_;
\r
1976 return LOWER_CASE_;
\r
1981 * Converts a char to the uppercase Kana
\r
1982 * @param ch character to convert
\r
1983 * @return the converted Kana character
\r
1985 private static final char toLargeKana(char ch)
\r
1987 if (0x3042 < ch && ch < 0x30ef) { // Kana range
\r
1988 switch (ch - 0x3000) {
\r
2021 * Converts a char to the lowercase Kana
\r
2022 * @param ch character to convert
\r
2023 * @return the converted Kana character
\r
2025 private static final char toSmallKana(char ch)
\r
2027 if (0x3042 < ch && ch < 0x30ef) { // Kana range
\r
2028 switch (ch - 0x3000) {
\r
2061 * This should be connected to special Jamo handling.
\r
2063 private int getFirstCE(char ch)
\r
2065 m_utilColEIter_.setText(UCharacter.toString(ch));
\r
2066 return m_utilColEIter_.next();
\r
2070 * This adds a read element, while testing for existence
\r
2071 * @param t build table
\r
2075 private int addAnElement(BuildTable t, Elements element)
\r
2077 Vector expansions = t.m_expansions_;
\r
2078 element.m_mapCE_ = 0;
\r
2080 if (element.m_CELength_ == 1) {
\r
2081 element.m_mapCE_ = element.m_CEs_[0];
\r
2084 // unfortunately, it looks like we have to look for a long primary
\r
2085 // here since in canonical closure we are going to hit some long
\r
2086 // primaries from the first phase, and they will come back as
\r
2087 // continuations/expansions destroying the effect of the previous
\r
2088 // opitimization. A long primary is a three byte primary with
\r
2089 // starting secondaries and tertiaries. It can appear in long runs
\r
2090 // of only primary differences (like east Asian tailorings) also,
\r
2091 // it should not be an expansion, as expansions would break with
\r
2093 if (element.m_CELength_ == 2 // a two CE expansion
\r
2094 && RuleBasedCollator.isContinuation(element.m_CEs_[1])
\r
2095 && (element.m_CEs_[1]
\r
2096 & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_)))
\r
2097 == 0 // that has only primaries in continuation
\r
2098 && (((element.m_CEs_[0] >> 8) & 0xFF)
\r
2099 == RuleBasedCollator.BYTE_COMMON_)
\r
2100 // a common secondary
\r
2101 && ((element.m_CEs_[0] & 0xFF)
\r
2102 == RuleBasedCollator.BYTE_COMMON_) // and a common tertiary
\r
2104 element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
\r
2105 // a long primary special
\r
2106 | (CE_LONG_PRIMARY_TAG_ << 24)
\r
2107 // first and second byte of primary
\r
2108 | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
\r
2109 // third byte of primary
\r
2110 | ((element.m_CEs_[1] >> 24) & 0xFF);
\r
2113 // omitting expansion offset in builder
\r
2114 // (HEADER_SIZE_ >> 2)
\r
2115 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
\r
2116 | (CE_EXPANSION_TAG_
\r
2117 << RuleBasedCollator.CE_TAG_SHIFT_)
\r
2118 | (addExpansion(expansions, element.m_CEs_[0])
\r
2121 for (int i = 1; i < element.m_CELength_; i ++) {
\r
2122 addExpansion(expansions, element.m_CEs_[i]);
\r
2124 if (element.m_CELength_ <= 0xF) {
\r
2125 expansion |= element.m_CELength_;
\r
2128 addExpansion(expansions, 0);
\r
2130 element.m_mapCE_ = expansion;
\r
2131 setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
\r
2132 (byte)element.m_CELength_,
\r
2133 t.m_maxExpansions_);
\r
2134 if (isJamo(element.m_cPoints_.charAt(0))){
\r
2135 t.m_collator_.m_isJamoSpecial_ = true;
\r
2136 setMaxJamoExpansion(element.m_cPoints_.charAt(0),
\r
2137 element.m_CEs_[element.m_CELength_
\r
2139 (byte)element.m_CELength_,
\r
2140 t.m_maxJamoExpansions_);
\r
2145 // We treat digits differently - they are "uber special" and should be
\r
2146 // processed differently if numeric collation is on.
\r
2148 if ((element.m_uchars_.length() == 2)
\r
2149 && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
\r
2150 uniChar = UCharacterProperty.getRawSupplementary(
\r
2151 element.m_uchars_.charAt(0),
\r
2152 element.m_uchars_.charAt(1));
\r
2154 else if (element.m_uchars_.length() == 1) {
\r
2155 uniChar = element.m_uchars_.charAt(0);
\r
2158 // Here, we either have one normal CE OR mapCE is set. Therefore, we
\r
2159 // stuff only one element to the expansion buffer. When we encounter a
\r
2160 // digit and we don't do numeric collation, we will just pick the CE
\r
2161 // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
\r
2162 // && ucol_prv_getSpecialPrevCE). If we picked a special, further
\r
2163 // processing will occur. If it's a simple CE, we'll return due
\r
2164 // to how the loop is constructed.
\r
2165 if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
\r
2166 // prepare the element
\r
2167 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
\r
2168 | (CollationElementIterator.CE_DIGIT_TAG_
\r
2169 << RuleBasedCollator.CE_TAG_SHIFT_) | 1;
\r
2170 if (element.m_mapCE_ != 0) {
\r
2171 // if there is an expansion, we'll pick it here
\r
2172 expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
\r
2175 expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
\r
2177 element.m_mapCE_ = expansion;
\r
2180 // here we want to add the prefix structure.
\r
2181 // I will try to process it as a reverse contraction, if possible.
\r
2182 // prefix buffer is already reversed.
\r
2184 if (element.m_prefixChars_ != null &&
\r
2185 element.m_prefixChars_.length() - element.m_prefix_ > 0) {
\r
2186 // We keep the seen prefix starter elements in a hashtable we need
\r
2187 // it to be able to distinguish between the simple codepoints and
\r
2188 // prefix starters. Also, we need to use it for canonical closure.
\r
2189 m_utilElement2_.m_caseBit_ = element.m_caseBit_;
\r
2190 m_utilElement2_.m_CELength_ = element.m_CELength_;
\r
2191 m_utilElement2_.m_CEs_ = element.m_CEs_;
\r
2192 m_utilElement2_.m_mapCE_ = element.m_mapCE_;
\r
2193 //m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
\r
2194 m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
\r
2195 m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
\r
2196 m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
\r
2197 m_utilElement2_.m_variableTop_ = element.m_variableTop_;
\r
2198 m_utilElement2_.m_prefix_ = element.m_prefix_;
\r
2199 m_utilElement2_.m_prefixChars_ = Normalizer.compose(element.m_prefixChars_, false);
\r
2200 m_utilElement2_.m_uchars_ = element.m_uchars_;
\r
2201 m_utilElement2_.m_cPoints_ = element.m_cPoints_;
\r
2202 m_utilElement2_.m_cPointsOffset_ = 0;
\r
2204 if (t.m_prefixLookup_ != null) {
\r
2205 Elements uCE = (Elements)t.m_prefixLookup_.get(element);
\r
2206 if (uCE != null) {
\r
2207 // there is already a set of code points here
\r
2208 element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);
\r
2210 else { // no code points, so this spot is clean
\r
2211 element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);
\r
2212 uCE = new Elements(element);
\r
2213 uCE.m_cPoints_ = uCE.m_uchars_;
\r
2214 t.m_prefixLookup_.put(uCE, uCE);
\r
2216 if (m_utilElement2_.m_prefixChars_.length()
\r
2217 != element.m_prefixChars_.length() - element.m_prefix_
\r
2218 || !m_utilElement2_.m_prefixChars_.regionMatches(0,
\r
2219 element.m_prefixChars_, element.m_prefix_,
\r
2220 m_utilElement2_.m_prefixChars_.length())) {
\r
2222 m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,
\r
2228 // We need to use the canonical iterator here
\r
2229 // the way we do it is to generate the canonically equivalent strings
\r
2230 // for the contraction and then add the sequences that pass FCD check
\r
2231 if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
\r
2232 && !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2
\r
2233 && UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0))
\r
2234 && UTF16.isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
\r
2235 // this is a contraction, we should check whether a composed form
\r
2236 // should also be included
\r
2237 m_utilCanIter_.setSource(element.m_cPoints_);
\r
2238 String source = m_utilCanIter_.next();
\r
2239 while (source != null && source.length() > 0) {
\r
2240 if (Normalizer.quickCheck(source, Normalizer.FCD,0)
\r
2241 != Normalizer.NO) {
\r
2242 element.m_uchars_ = source;
\r
2243 element.m_cPoints_ = element.m_uchars_;
\r
2244 finalizeAddition(t, element);
\r
2246 source = m_utilCanIter_.next();
\r
2249 return element.m_mapCE_;
\r
2252 return finalizeAddition(t, element);
\r
2257 * Adds an expansion ce to the expansion vector
\r
2258 * @param expansions vector to add to
\r
2259 * @param value of the expansion
\r
2260 * @return the current position of the new element
\r
2262 private static final int addExpansion(Vector expansions, int value)
\r
2264 expansions.add(new Integer(value));
\r
2265 return expansions.size() - 1;
\r
2269 * Looks for the maximum length of all expansion sequences ending with the
\r
2270 * same collation element. The size required for maxexpansion and maxsize
\r
2271 * is returned if the arrays are too small.
\r
2272 * @param endexpansion the last expansion collation element to be added
\r
2273 * @param expansionsize size of the expansion
\r
2274 * @param maxexpansion data structure to store the maximum expansion data.
\r
2275 * @return size of the maxexpansion and maxsize used.
\r
2277 private static int setMaxExpansion(int endexpansion, byte expansionsize,
\r
2278 MaxExpansionTable maxexpansion)
\r
2281 int limit = maxexpansion.m_endExpansionCE_.size();
\r
2282 long unsigned = (long)endexpansion;
\r
2283 unsigned &= 0xFFFFFFFFl;
\r
2285 // using binary search to determine if last expansion element is
\r
2286 // already in the array
\r
2288 while (start < limit - 1) {
\r
2289 int mid = start + ((limit - start) >> 1);
\r
2290 long unsignedce = ((Integer)maxexpansion.m_endExpansionCE_.get(
\r
2291 mid)).intValue();
\r
2292 unsignedce &= 0xFFFFFFFFl;
\r
2293 if (unsigned <= unsignedce) {
\r
2301 if (((Integer)maxexpansion.m_endExpansionCE_.get(start)).intValue()
\r
2302 == endexpansion) {
\r
2305 else if (((Integer)maxexpansion.m_endExpansionCE_.get(limit)).intValue()
\r
2306 == endexpansion) {
\r
2309 if (result > -1) {
\r
2310 // found the ce in expansion, we'll just modify the size if it
\r
2312 Object currentsize = maxexpansion.m_expansionCESize_.get(result);
\r
2313 if (((Byte)currentsize).byteValue() < expansionsize) {
\r
2314 maxexpansion.m_expansionCESize_.set(result,
\r
2315 new Byte(expansionsize));
\r
2319 // we'll need to squeeze the value into the array. initial
\r
2320 // implementation. shifting the subarray down by 1
\r
2321 maxexpansion.m_endExpansionCE_.insertElementAt(
\r
2322 new Integer(endexpansion),
\r
2324 maxexpansion.m_expansionCESize_.insertElementAt(
\r
2325 new Byte(expansionsize),
\r
2328 return maxexpansion.m_endExpansionCE_.size();
\r
2332 * Sets the maximum length of all jamo expansion sequences ending with the
\r
2333 * same collation element. The size required for maxexpansion and maxsize
\r
2334 * is returned if the arrays are too small.
\r
2335 * @param ch the jamo codepoint
\r
2336 * @param endexpansion the last expansion collation element to be added
\r
2337 * @param expansionsize size of the expansion
\r
2338 * @param maxexpansion data structure to store the maximum expansion data.
\r
2339 * @return size of the maxexpansion and maxsize used.
\r
2341 private static int setMaxJamoExpansion(char ch, int endexpansion,
\r
2342 byte expansionsize,
\r
2343 MaxJamoExpansionTable maxexpansion)
\r
2345 boolean isV = true;
\r
2346 if (ch >= 0x1100 && ch <= 0x1112) {
\r
2347 // determines L for Jamo, doesn't need to store this since it is
\r
2348 // never at the end of a expansion
\r
2349 if (maxexpansion.m_maxLSize_ < expansionsize) {
\r
2350 maxexpansion.m_maxLSize_ = expansionsize;
\r
2352 return maxexpansion.m_endExpansionCE_.size();
\r
2355 if (ch >= 0x1161 && ch <= 0x1175) {
\r
2356 // determines V for Jamo
\r
2357 if (maxexpansion.m_maxVSize_ < expansionsize) {
\r
2358 maxexpansion.m_maxVSize_ = expansionsize;
\r
2362 if (ch >= 0x11A8 && ch <= 0x11C2) {
\r
2364 // determines T for Jamo
\r
2365 if (maxexpansion.m_maxTSize_ < expansionsize) {
\r
2366 maxexpansion.m_maxTSize_ = expansionsize;
\r
2370 int pos = maxexpansion.m_endExpansionCE_.size();
\r
2373 if (((Integer)maxexpansion.m_endExpansionCE_.get(pos)).intValue()
\r
2374 == endexpansion) {
\r
2375 return maxexpansion.m_endExpansionCE_.size();
\r
2378 maxexpansion.m_endExpansionCE_.add(new Integer(endexpansion));
\r
2379 maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE);
\r
2381 return maxexpansion.m_endExpansionCE_.size();
\r
2385 * Adds a prefix to the table
\r
2386 * @param t build table to update
\r
2387 * @param CE collation element to add
\r
2388 * @param element rule element to add
\r
2389 * @return modified ce
\r
2391 private int addPrefix(BuildTable t, int CE, Elements element)
\r
2393 // currently the longest prefix we're supporting in Japanese is two
\r
2394 // characters long. Although this table could quite easily mimic
\r
2395 // complete contraction stuff there is no good reason to make a general
\r
2396 // solution, as it would require some error prone messing.
\r
2397 ContractionTable contractions = t.m_contractions_;
\r
2398 String oldCP = element.m_cPoints_;
\r
2399 int oldCPOffset = element.m_cPointsOffset_;
\r
2401 contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
\r
2402 // here, we will normalize & add prefix to the table.
\r
2403 int size = element.m_prefixChars_.length() - element.m_prefix_;
\r
2404 for (int j = 1; j < size; j ++) {
\r
2405 // First add NFD prefix chars to unsafe CP hash table
\r
2406 // Unless it is a trail surrogate, which is handled algorithmically
\r
2407 // and shouldn't take up space in the table.
\r
2408 char ch = element.m_prefixChars_.charAt(j + element.m_prefix_);
\r
2409 if (!UTF16.isTrailSurrogate(ch)) {
\r
2410 unsafeCPSet(t.m_unsafeCP_, ch);
\r
2414 // StringBuffer reversed = new StringBuffer();
\r
2415 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
2416 for (int j = 0; j < size; j ++) {
\r
2417 // prefixes are going to be looked up backwards
\r
2418 // therefore, we will promptly reverse the prefix buffer...
\r
2419 int offset = element.m_prefixChars_.length() - j - 1;
\r
2420 m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset));
\r
2422 element.m_prefixChars_ = m_utilStringBuffer_.toString();
\r
2423 element.m_prefix_ = 0;
\r
2425 // the first codepoint is also unsafe, as it forms a 'contraction' with
\r
2427 if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
\r
2428 unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
\r
2431 element.m_cPoints_ = element.m_prefixChars_;
\r
2432 element.m_cPointsOffset_ = element.m_prefix_;
\r
2434 // Add the last char of the contraction to the contraction-end hash
\r
2435 // table. unless it is a trail surrogate, which is handled
\r
2436 // algorithmically and shouldn't be in the table
\r
2437 if (!UTF16.isTrailSurrogate(
\r
2438 element.m_cPoints_.charAt(element.m_cPoints_.length() - 1))) {
\r
2439 ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_.charAt(
\r
2440 element.m_cPoints_.length() - 1));
\r
2442 // First we need to check if contractions starts with a surrogate
\r
2443 // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
\r
2445 // If there are any Jamos in the contraction, we should turn on special
\r
2446 // processing for Jamos
\r
2447 if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
\r
2448 t.m_collator_.m_isJamoSpecial_ = true;
\r
2450 // then we need to deal with it
\r
2451 // we could already have something in table - or we might not
\r
2452 if (!isPrefix(CE)) {
\r
2453 // if it wasn't contraction, we wouldn't end up here
\r
2454 int firstContractionOffset = addContraction(contractions,
\r
2455 CONTRACTION_TABLE_NEW_ELEMENT_,
\r
2457 int newCE = processContraction(contractions, element,
\r
2459 addContraction(contractions, firstContractionOffset,
\r
2460 element.m_prefixChars_.charAt(element.m_prefix_),
\r
2462 addContraction(contractions, firstContractionOffset, (char)0xFFFF,
\r
2464 CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset);
\r
2467 // we are adding to existing contraction
\r
2468 // there were already some elements in the table, so we need to add
\r
2469 // a new contraction
\r
2470 // Two things can happen here: either the codepoint is already in
\r
2471 // the table, or it is not
\r
2472 char ch = element.m_prefixChars_.charAt(element.m_prefix_);
\r
2473 int position = findCP(contractions, CE, ch);
\r
2474 if (position > 0) {
\r
2475 // if it is we just continue down the chain
\r
2476 int eCE = getCE(contractions, CE, position);
\r
2477 int newCE = processContraction(contractions, element, eCE);
\r
2478 setContraction(contractions, CE, position, ch, newCE);
\r
2481 // if it isn't, we will have to create a new sequence
\r
2482 processContraction(contractions, element, CE_NOT_FOUND_);
\r
2483 insertContraction(contractions, CE, ch, element.m_mapCE_);
\r
2487 element.m_cPoints_ = oldCP;
\r
2488 element.m_cPointsOffset_ = oldCPOffset;
\r
2494 * Checks if the argument ce is a contraction
\r
2495 * @param CE collation element
\r
2496 * @return true if argument ce is a contraction
\r
2498 private static final boolean isContraction(int CE)
\r
2500 return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
\r
2504 * Checks if the argument ce has a prefix
\r
2505 * @param CE collation element
\r
2506 * @return true if argument ce has a prefix
\r
2508 private static final boolean isPrefix(int CE)
\r
2510 return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
\r
2514 * Checks if the argument ce is special
\r
2515 * @param CE collation element
\r
2516 * @return true if argument ce is special
\r
2518 private static final boolean isSpecial(int CE)
\r
2520 return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
\r
2524 * Checks if the argument ce has a prefix
\r
2525 * @param CE collation element
\r
2526 * @return true if argument ce has a prefix
\r
2528 private static final int getCETag(int CE)
\r
2530 return (CE & RuleBasedCollator.CE_TAG_MASK_) >>>
\r
2531 RuleBasedCollator.CE_TAG_SHIFT_;
\r
2535 * Gets the ce at position in contraction table
\r
2536 * @param table contraction table
\r
2537 * @param position offset to the contraction table
\r
2540 private static final int getCE(ContractionTable table, int element,
\r
2543 element &= 0xFFFFFF;
\r
2544 BasicContractionTable tbl = getBasicContractionTable(table, element);
\r
2546 if (tbl == null) {
\r
2547 return CE_NOT_FOUND_;
\r
2549 if (position > tbl.m_CEs_.size() || position == -1) {
\r
2550 return CE_NOT_FOUND_;
\r
2553 return ((Integer)tbl.m_CEs_.get(position)).intValue();
\r
2558 * Sets the unsafe character
\r
2559 * @param table unsafe table
\r
2560 * @param c character to be added
\r
2562 private static final void unsafeCPSet(byte table[], char c)
// Marks c in the "unsafe" bit set: characters that cannot safely start a
// backwards iteration over collation elements.
// NOTE(review): this listing is incomplete -- the opening brace, the
// declaration of the 'hash' local (presumably 'int hash = c;'), the early
// 'return;' for the surrogate/private-use case and the closing braces
// (listing lines 2563-2564, 2569-2570, 2572, 2574) were dropped by the
// extraction; recover them from the upstream source before compiling.

2565 if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {

2566 if (hash >= 0xd800 && hash <= 0xf8ff) {

2567 // Part of a surrogate, or in private use area.

2568 // These don't go in the table

2571 hash = (hash & UNSAFECP_TABLE_MASK_) + 256;

// one bit per candidate character: byte index = hash >> 3, bit = hash & 7
2573 table[hash >> 3] |= (1 << (hash & 7));
\r
2577 * Sets the contraction end character
\r
2578 * @param table contraction end table
\r
2579 * @param c character to be added
\r
2581 private static final void ContrEndCPSet(byte table[], char c)
// Marks c in the contraction-end bit set, using the same hashing scheme as
// unsafeCPSet but without the surrogate/private-use early-out.
// NOTE(review): incomplete extraction -- the opening brace and the 'hash'
// local declaration (presumably 'int hash = c;', listing lines 2582-2583)
// plus the closing braces are missing from this copy.

2584 if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {

2585 hash = (hash & UNSAFECP_TABLE_MASK_) + 256;

2587 table[hash >> 3] |= (1 << (hash & 7));
\r
2591 * Adds more contractions in table. If element is non-existent, it creates

2592 * one. Returns the element handle.
\r
2593 * @param table contraction table
\r
2594 * @param element offset to the contraction table
\r
2595 * @param codePoint codepoint to add
\r
2597 * @return collation element
\r
2599 private static int addContraction(ContractionTable table, int element,
\r
2600 char codePoint, int value)
\r
2602 BasicContractionTable tbl = getBasicContractionTable(table, element);
\r
2603 if (tbl == null) {
\r
2604 tbl = addAContractionElement(table);
\r
2605 element = table.m_elements_.size() - 1;
\r
2608 tbl.m_CEs_.add(new Integer(value));
\r
2609 tbl.m_codePoints_.append(codePoint);
\r
2610 return constructSpecialCE(table.m_currentTag_, element);
\r
2614 * Adds a contraction element to the table
\r
2615 * @param table contraction table to update
\r
2616 * @return contraction
\r
2618 private static BasicContractionTable addAContractionElement(
\r
2619 ContractionTable table)
\r
2621 BasicContractionTable result = new BasicContractionTable();
\r
2622 table.m_elements_.add(result);
\r
2627 * Constructs a special ce
\r
2628 * @param tag special tag
\r
2629 * @param CE collation element
\r
2630 * @return a contraction ce
\r
2632 private static final int constructSpecialCE(int tag, int CE)
\r
2634 return RuleBasedCollator.CE_SPECIAL_FLAG_
\r
2635 | (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF);
\r
2639 * Sets and inserts the element that has a contraction
\r
2640 * @param contractions contraction table
\r
2641 * @param element contracting element
\r
2642 * @param existingCE
\r
2643 * @return contraction ce
\r
2645 private static int processContraction(ContractionTable contractions,
\r
2646 Elements element,
\r
2649 int firstContractionOffset = 0;
\r
2650 // end of recursion
\r
2651 if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
\r
2652 if (isContractionTableElement(existingCE)
\r
2653 && getCETag(existingCE) == contractions.m_currentTag_) {
\r
2654 changeContraction(contractions, existingCE, (char)0,
\r
2655 element.m_mapCE_);
\r
2656 changeContraction(contractions, existingCE, (char)0xFFFF,
\r
2657 element.m_mapCE_);
\r
2658 return existingCE;
\r
2661 // can't do just that. existingCe might be a contraction,
\r
2662 // meaning that we need to do another step
\r
2663 return element.m_mapCE_;
\r
2667 // this recursion currently feeds on the only element we have...
\r
2668 // We will have to copy it in order to accommodate for both backward
\r
2669 // and forward cycles
\r
2670 // we encountered either an empty space or a non-contraction element
\r
2671 // this means we are constructing a new contraction sequence
\r
2672 element.m_cPointsOffset_ ++;
\r
2673 if (!isContractionTableElement(existingCE)) {
\r
2674 // if it wasn't contraction, we wouldn't end up here
\r
2675 firstContractionOffset = addContraction(contractions,
\r
2676 CONTRACTION_TABLE_NEW_ELEMENT_,
\r
2677 (char)0, existingCE);
\r
2678 int newCE = processContraction(contractions, element,
\r
2680 addContraction(contractions, firstContractionOffset,
\r
2681 element.m_cPoints_.charAt(element.m_cPointsOffset_),
\r
2683 addContraction(contractions, firstContractionOffset,
\r
2684 (char)0xFFFF, existingCE);
\r
2685 existingCE = constructSpecialCE(contractions.m_currentTag_,
\r
2686 firstContractionOffset);
\r
2689 // we are adding to existing contraction
\r
2690 // there were already some elements in the table, so we need to add
\r
2691 // a new contraction
\r
2692 // Two things can happen here: either the codepoint is already in
\r
2693 // the table, or it is not
\r
2694 int position = findCP(contractions, existingCE,
\r
2695 element.m_cPoints_.charAt(element.m_cPointsOffset_));
\r
2696 if (position > 0) {
\r
2697 // if it is we just continue down the chain
\r
2698 int eCE = getCE(contractions, existingCE, position);
\r
2699 int newCE = processContraction(contractions, element, eCE);
\r
2700 setContraction(contractions, existingCE, position,
\r
2701 element.m_cPoints_.charAt(element.m_cPointsOffset_),
\r
2705 // if it isn't, we will have to create a new sequence
\r
2706 int newCE = processContraction(contractions, element,
\r
2708 insertContraction(contractions, existingCE,
\r
2709 element.m_cPoints_.charAt(element.m_cPointsOffset_),
\r
2713 element.m_cPointsOffset_ --;
\r
2714 return existingCE;
\r
2718 * Checks if CE belongs to the contraction table
\r
2719 * @param CE collation element to test
\r
2720 * @return true if CE belongs to the contraction table
\r
2722 private static final boolean isContractionTableElement(int CE)
\r
2724 return isSpecial(CE)
\r
2725 && (getCETag(CE) == CE_CONTRACTION_TAG_
\r
2726 || getCETag(CE) == CE_SPEC_PROC_TAG_);
\r
2730 * Gets the codepoint
\r
2731 * @param table contraction table
\r
2732 * @param element offset to the contraction element in the table
\r
2733 * @param codePoint code point to look for
\r
2734 * @return the offset to the code point
\r
2736 private static int findCP(ContractionTable table, int element,
// Scans the (sorted) code-point list of the basic contraction table
// addressed by element, looking for codePoint; returns its offset.
// NOTE(review): incomplete extraction -- the second parameter line
// (presumably 'char codePoint)'), the 'position' index declaration and
// increment, the 'return -1;' failure branches and the success
// 'return position;' (listing lines 2737-2738, 2741-2744, 2746,
// 2748-2750, 2752-2757) are missing from this copy; recover from the
// upstream source before compiling.

2739 BasicContractionTable tbl = getBasicContractionTable(table, element);

2740 if (tbl == null) {

2745 while (codePoint > tbl.m_codePoints_.charAt(position)) {

2747 if (position > tbl.m_codePoints_.length()) {

2751 if (codePoint == tbl.m_codePoints_.charAt(position)) {
\r
2760 * Gets the contraction element out of the contraction table
\r
2761 * @param table contraction table
\r
2762 * @param offset to the element in the contraction table
\r
2763 * @return basic contraction element at offset in the contraction table
\r
2765 private static final BasicContractionTable getBasicContractionTable(
\r
2766 ContractionTable table,
\r
2769 offset &= 0xFFFFFF;
\r
2770 if (offset == 0xFFFFFF) {
\r
2773 return (BasicContractionTable)table.m_elements_.get(offset);
\r
2777 * Changes the contraction element
\r
2778 * @param table contraction table
\r
2779 * @param element offset to the element in the contraction table
\r
2780 * @param codePoint codepoint
\r
2781 * @param newCE new collation element
\r
2782 * @return basic contraction element at offset in the contraction table
\r
2784 private static final int changeContraction(ContractionTable table,
// Finds codePoint in the element's code-point list and overwrites the CE
// stored at its position; returns the element's low 24 bits on success,
// CE_NOT_FOUND_ otherwise.
// NOTE(review): incomplete extraction -- the final parameter line
// (presumably 'int newCE)'), the null-table return, the 'position' index
// declaration/increment and several closing braces (listing lines
// 2786-2787, 2790-2792, 2794, 2797-2798, 2802-2803) are missing from this
// copy; recover from the upstream source before compiling.

2785 int element, char codePoint,

2788 BasicContractionTable tbl = getBasicContractionTable(table, element);

2789 if (tbl == null) {

2793 while (codePoint > tbl.m_codePoints_.charAt(position)) {

2795 if (position > tbl.m_codePoints_.length()) {

2796 return CE_NOT_FOUND_;

2799 if (codePoint == tbl.m_codePoints_.charAt(position)) {

// overwrite in place; CE list and code-point string are index-parallel
2800 tbl.m_CEs_.set(position, new Integer(newCE));

2801 return element & 0xFFFFFF;

2804 return CE_NOT_FOUND_;
\r
2809 * Sets a part of contraction sequence in table. If element is non
\r
2810 * existant, it creates on. Returns element handle.
\r
2811 * @param table contraction table
\r
2812 * @param element offset to the contraction table
\r
2814 * @param codePoint contraction character
\r
2815 * @param value ce value
\r
2816 * @return new contraction ce
\r
2818 private static final int setContraction(ContractionTable table,
\r
2819 int element, int offset,
\r
2820 char codePoint, int value)
\r
2822 element &= 0xFFFFFF;
\r
2823 BasicContractionTable tbl = getBasicContractionTable(table, element);
\r
2824 if (tbl == null) {
\r
2825 tbl = addAContractionElement(table);
\r
2826 element = table.m_elements_.size() - 1;
\r
2829 tbl.m_CEs_.set(offset, new Integer(value));
\r
2830 tbl.m_codePoints_.setCharAt(offset, codePoint);
\r
2831 return constructSpecialCE(table.m_currentTag_, element);
\r
2835 * Inserts a part of contraction sequence in table. Sequences behind the
\r
2836 * offset are moved back. If element is non-existent, it creates one.
\r
2837 * @param table contraction
\r
2838 * @param element offset to the table contraction
\r
2839 * @param codePoint code point
\r
2840 * @param value collation element value
\r
2841 * @return contraction collation element
\r
2843 private static final int insertContraction(ContractionTable table,
// Inserts a (codePoint, CE) pair into the element's sorted code-point list,
// shifting later entries back; creates the basic table if absent.
// NOTE(review): incomplete extraction -- the final parameter line
// (presumably 'int value)'), the 'offset' index declaration and increment,
// and the closing braces (listing lines 2845-2846, 2852-2854, 2857-2859,
// 2862, 2864) are missing from this copy; recover from the upstream source
// before compiling.

2844 int element, char codePoint,

2847 element &= 0xFFFFFF;

2848 BasicContractionTable tbl = getBasicContractionTable(table, element);

2849 if (tbl == null) {

2850 tbl = addAContractionElement(table);

2851 element = table.m_elements_.size() - 1;

// NOTE(review): charAt(offset) is evaluated before the offset bound check
// in this condition, so a search that runs past the end would throw before
// the guard fires -- confirm the operand order against the upstream source.
2855 while (tbl.m_codePoints_.charAt(offset) < codePoint

2856 && offset < tbl.m_codePoints_.length()) {

2860 tbl.m_CEs_.insertElementAt(new Integer(value), offset);

2861 tbl.m_codePoints_.insert(offset, codePoint);

2863 return constructSpecialCE(table.m_currentTag_, element);
\r
2867 * Finalize addition
\r
2868 * @param t build table
\r
2869 * @param element to add
\r
2871 private final static int finalizeAddition(BuildTable t, Elements element)
\r
2873 int CE = CE_NOT_FOUND_;
\r
2874 // This should add a completely ignorable element to the
\r
2875 // unsafe table, so that backward iteration will skip
\r
2876 // over it when treating contractions.
\r
2877 if (element.m_mapCE_ == 0) {
\r
2878 for (int i = 0; i < element.m_cPoints_.length(); i ++) {
\r
2879 char ch = element.m_cPoints_.charAt(i);
\r
2880 if (!UTF16.isTrailSurrogate(ch)) {
\r
2881 unsafeCPSet(t.m_unsafeCP_, ch);
\r
2886 if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
\r
2887 // we're adding a contraction
\r
2888 int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
\r
2889 CE = t.m_mapping_.getValue(cp);
\r
2890 CE = addContraction(t, CE, element);
\r
2894 CE = t.m_mapping_.getValue(element.m_cPoints_.charAt(
\r
2895 element.m_cPointsOffset_));
\r
2897 if (CE != CE_NOT_FOUND_) {
\r
2898 if(isContractionTableElement(CE)) {
\r
2899 // adding a non contraction element (thai, expansion,
\r
2900 // single) to already existing contraction
\r
2901 if (!isPrefix(element.m_mapCE_)) {
\r
2902 // we cannot reenter prefix elements - as we are going
\r
2903 // to create a dead loop
\r
2904 // Only expansions and regular CEs can go here...
\r
2905 // Contractions will never happen in this place
\r
2906 setContraction(t.m_contractions_, CE, 0, (char)0,
\r
2907 element.m_mapCE_);
\r
2908 // This loop has to change the CE at the end of
\r
2909 // contraction REDO!
\r
2910 changeLastCE(t.m_contractions_, CE, element.m_mapCE_);
\r
2914 t.m_mapping_.setValue(element.m_cPoints_.charAt(
\r
2915 element.m_cPointsOffset_),
\r
2916 element.m_mapCE_);
\r
2917 if (element.m_prefixChars_ != null &&
\r
2918 element.m_prefixChars_.length()>0 &&
\r
2919 getCETag(CE) != CE_IMPLICIT_TAG_) {
\r
2920 // Add CE for standalone precontext char.
\r
2921 Elements origElem = new Elements();
\r
2922 origElem.m_prefixChars_ = null;
\r
2923 origElem.m_uchars_ = element.m_cPoints_;
\r
2924 origElem.m_cPoints_ = origElem.m_uchars_;
\r
2925 origElem.m_CEs_[0] = CE;
\r
2926 origElem.m_mapCE_ = CE;
\r
2927 origElem.m_CELength_ = 1;
\r
2928 finalizeAddition(t, origElem);
\r
2933 t.m_mapping_.setValue(element.m_cPoints_.charAt(
\r
2934 element.m_cPointsOffset_),
\r
2935 element.m_mapCE_);
\r
2942 * Note regarding surrogate handling: We are interested only in the single
\r
2943 * or leading surrogates in a contraction. If a surrogate is somewhere else
\r
2944 * in the contraction, it is going to be handled as a pair of code units,
\r
2945 * as it doesn't affect the performance AND handling surrogates specially
\r
2946 * would complicate code way too much.
\r
2948 private static int addContraction(BuildTable t, int CE, Elements element)
\r
2950 ContractionTable contractions = t.m_contractions_;
\r
2951 contractions.m_currentTag_ = CE_CONTRACTION_TAG_;
\r
2953 // First we need to check if contractions starts with a surrogate
\r
2954 int cp = UTF16.charAt(element.m_cPoints_, 0);
\r
2956 if (UCharacter.isSupplementary(cp)) {
\r
2959 if (cpsize < element.m_cPoints_.length()) {
\r
2960 // This is a real contraction, if there are other characters after
\r
2962 int size = element.m_cPoints_.length() - element.m_cPointsOffset_;
\r
2963 for (int j = 1; j < size; j ++) {
\r
2964 // First add contraction chars to unsafe CP hash table
\r
2965 // Unless it is a trail surrogate, which is handled
\r
2966 // algoritmically and shouldn't take up space in the table.
\r
2967 if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(
\r
2968 element.m_cPointsOffset_ + j))) {
\r
2969 unsafeCPSet(t.m_unsafeCP_,
\r
2970 element.m_cPoints_.charAt(
\r
2971 element.m_cPointsOffset_ + j));
\r
2974 // Add the last char of the contraction to the contraction-end
\r
2975 // hash table. unless it is a trail surrogate, which is handled
\r
2976 // algorithmically and shouldn't be in the table
\r
2977 if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(
\r
2978 element.m_cPoints_.length() -1))) {
\r
2979 ContrEndCPSet(t.m_contrEndCP_,
\r
2980 element.m_cPoints_.charAt(
\r
2981 element.m_cPoints_.length() -1));
\r
2984 // If there are any Jamos in the contraction, we should turn on
\r
2985 // special processing for Jamos
\r
2986 if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) {
\r
2987 t.m_collator_.m_isJamoSpecial_ = true;
\r
2989 // then we need to deal with it
\r
2990 // we could already have something in table - or we might not
\r
2991 element.m_cPointsOffset_ += cpsize;
\r
2992 if (!isContraction(CE)) {
\r
2993 // if it wasn't contraction, we wouldn't end up here
\r
2994 int firstContractionOffset = addContraction(contractions,
\r
2995 CONTRACTION_TABLE_NEW_ELEMENT_, (char)0, CE);
\r
2996 int newCE = processContraction(contractions, element,
\r
2998 addContraction(contractions, firstContractionOffset,
\r
2999 element.m_cPoints_.charAt(element.m_cPointsOffset_),
\r
3001 addContraction(contractions, firstContractionOffset,
\r
3002 (char)0xFFFF, CE);
\r
3003 CE = constructSpecialCE(CE_CONTRACTION_TAG_,
\r
3004 firstContractionOffset);
\r
3007 // we are adding to existing contraction
\r
3008 // there were already some elements in the table, so we need to
\r
3009 // add a new contraction
\r
3010 // Two things can happen here: either the codepoint is already
\r
3011 // in the table, or it is not
\r
3012 int position = findCP(contractions, CE,
\r
3013 element.m_cPoints_.charAt(element.m_cPointsOffset_));
\r
3014 if (position > 0) {
\r
3015 // if it is we just continue down the chain
\r
3016 int eCE = getCE(contractions, CE, position);
\r
3017 int newCE = processContraction(contractions, element, eCE);
\r
3018 setContraction(contractions, CE, position,
\r
3019 element.m_cPoints_.charAt(element.m_cPointsOffset_),
\r
3023 // if it isn't, we will have to create a new sequence
\r
3024 int newCE = processContraction(contractions, element,
\r
3026 insertContraction(contractions, CE,
\r
3027 element.m_cPoints_.charAt(element.m_cPointsOffset_),
\r
3031 element.m_cPointsOffset_ -= cpsize;
\r
3032 t.m_mapping_.setValue(cp, CE);
\r
3034 else if (!isContraction(CE)) {
\r
3035 // this is just a surrogate, and there is no contraction
\r
3036 t.m_mapping_.setValue(cp, element.m_mapCE_);
\r
3039 // fill out the first stage of the contraction with the surrogate
\r
3041 changeContraction(contractions, CE, (char)0, element.m_mapCE_);
\r
3042 changeContraction(contractions, CE, (char)0xFFFF, element.m_mapCE_);
\r
3048 * this is for adding non contractions
\r
3049 * @param table contraction table
\r
3050 * @param element offset to the contraction table
\r
3051 * @param value collation element value
\r
3052 * @return new collation element
\r
3054 private static final int changeLastCE(ContractionTable table, int element,
// Overwrites the last CE of the basic contraction table addressed by
// element (used to redirect the end-of-contraction CE).
// NOTE(review): incomplete extraction -- the final parameter line
// (presumably 'int value)'), the opening brace, the null-table return value
// and the closing braces (listing lines 3055-3056, 3059-3061, 3064) are
// missing from this copy; recover from the upstream source before
// compiling.

3057 BasicContractionTable tbl = getBasicContractionTable(table, element);

3058 if (tbl == null) {

3062 tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, new Integer(value));

3063 return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF);
\r
3067 * Given a set of ranges calculated by allocWeights(), iterate through the
\r
3068 * weights. Sets the next weight in cegenerator.m_current_.
\r
3069 * @param cegenerator object that contains ranges weight range array and
\r
3071 * @return the next weight
\r
3073 private static int nextWeight(CEGenerator cegenerator)
\r
3075 if (cegenerator.m_rangesLength_ > 0) {
\r
3076 // get maxByte from the .count field
\r
3077 int maxByte = cegenerator.m_ranges_[0].m_count_;
\r
3078 // get the next weight
\r
3079 int weight = cegenerator.m_ranges_[0].m_start_;
\r
3080 if (weight == cegenerator.m_ranges_[0].m_end_) {
\r
3081 // this range is finished, remove it and move the following
\r
3083 cegenerator.m_rangesLength_ --;
\r
3084 if (cegenerator.m_rangesLength_ > 0) {
\r
3085 System.arraycopy(cegenerator.m_ranges_, 1,
\r
3086 cegenerator.m_ranges_, 0,
\r
3087 cegenerator.m_rangesLength_);
\r
3088 cegenerator.m_ranges_[0].m_count_ = maxByte;
\r
3089 // keep maxByte in ranges[0]
\r
3093 // increment the weight for the next value
\r
3094 cegenerator.m_ranges_[0].m_start_
\r
3095 = incWeight(weight, cegenerator.m_ranges_[0].m_length2_,
\r
3104 * Increment the collation weight
\r
3105 * @param weight to increment
\r
3108 * @return new incremented weight
\r
3110 private static final int incWeight(int weight, int length, int maxByte)
// Increments a collation weight at the given byte length, carrying into the
// previous byte when the current one reaches maxByte.
// NOTE(review): incomplete extraction -- the enclosing retry loop
// (presumably 'for (;;) {') and the step that moves to the previous byte
// on rollover (presumably '-- length;'), plus the closing braces (listing
// lines 3111-3112, 3116-3117, 3122-3126), are missing from this copy;
// recover from the upstream source before compiling.

3113 int b = getWeightByte(weight, length);

3114 if (b < maxByte) {

3115 return setWeightByte(weight, length, b + 1);

3118 // roll over, set this byte to BYTE_FIRST_TAILORED_ and

3119 // increment the previous one

3120 weight = setWeightByte(weight, length,

3121 RuleBasedCollator.BYTE_FIRST_TAILORED_);
\r
3128 * Gets the weight byte
\r
3133 private static final int getWeightByte(int weight, int index)
\r
3135 return (weight >> ((4 - index) << 3)) & 0xff;
\r
3139 * Set the weight byte in table
\r
3144 private static final int setWeightByte(int weight, int index, int b)
// Replaces the index-th byte of a 4-byte collation weight with b.
// NOTE(review): incomplete extraction -- the opening brace and the
// byte-to-bit-shift conversion (presumably 'index <<= 3;', listing lines
// 3145-3146) and the closing brace are missing from this copy; without
// that conversion the shifts below operate on a byte index and are wrong.
// Recover the dropped lines from the upstream source before compiling.

3147 // 0xffffffff except a 00 "hole" for the index-th byte

3148 int mask = 0xffffffff >>> index;

3149 index = 32 - index;

3150 mask |= 0xffffff00 << index;

3151 return (weight & mask) | (b << index);
\r
3155 * Call getWeightRanges and then determine heuristically which ranges to
\r
3156 * use for a given number of weights between (excluding) two limits
\r
3157 * @param lowerLimit
\r
3158 * @param upperLimit
\r
3164 private int allocateWeights(int lowerLimit, int upperLimit, int n,
\r
3165 int maxByte, WeightRange ranges[])
\r
3167 // number of usable byte values 3..maxByte
\r
3168 int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
\r
3169 // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_
\r
3170 // countBytes to the power of index, m_utilLongBuffer_ for unsignedness
\r
3171 // gcc requires explicit initialization
\r
3172 m_utilLongBuffer_[0] = 1;
\r
3173 m_utilLongBuffer_[1] = countBytes;
\r
3174 m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
\r
3175 m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
\r
3176 m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
\r
3177 int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte,
\r
3178 countBytes, ranges);
\r
3179 if (rangeCount <= 0) {
\r
3182 // what is the maximum number of weights with these ranges?
\r
3183 long maxCount = 0;
\r
3184 for (int i = 0; i < rangeCount; ++ i) {
\r
3185 maxCount += (long)ranges[i].m_count_
\r
3186 * m_utilLongBuffer_[4 - ranges[i].m_length_];
\r
3188 if (maxCount < n) {
\r
3191 // set the length2 and count2 fields
\r
3192 for (int i = 0; i < rangeCount; ++ i) {
\r
3193 ranges[i].m_length2_ = ranges[i].m_length_;
\r
3194 ranges[i].m_count2_ = ranges[i].m_count_;
\r
3196 // try until we find suitably large ranges
\r
3198 // get the smallest number of bytes in a range
\r
3199 int minLength = ranges[0].m_length2_;
\r
3200 // sum up the number of elements that fit into ranges of each byte
\r
3202 Arrays.fill(m_utilCountBuffer_, 0);
\r
3203 for (int i = 0; i < rangeCount; ++ i) {
\r
3204 m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
\r
3206 // now try to allocate n elements in the available short ranges
\r
3207 if (n <= m_utilCountBuffer_[minLength]
\r
3208 + m_utilCountBuffer_[minLength + 1]) {
\r
3209 // trivial cases, use the first few ranges
\r
3213 maxCount += ranges[rangeCount].m_count2_;
\r
3215 } while (n > maxCount);
\r
3218 else if (n <= ranges[0].m_count2_ * countBytes) {
\r
3219 // easy case, just make this one range large enough by
\r
3220 // lengthening it once more, possibly split it
\r
3222 // calculate how to split the range between maxLength-1
\r
3223 // (count1) and maxLength (count2)
\r
3225 = m_utilLongBuffer_[minLength - ranges[0].m_length_];
\r
3226 long power = power_1 * countBytes;
\r
3227 int count2 = (int)((n + power - 1) / power);
\r
3228 int count1 = ranges[0].m_count_ - count2;
\r
3229 // split the range
\r
3231 // lengthen the entire range to maxLength
\r
3232 lengthenRange(ranges, 0, maxByte, countBytes);
\r
3235 // really split the range
\r
3236 // create a new range with the end and initial and current
\r
3237 // length of the old one
\r
3239 ranges[1].m_end_ = ranges[0].m_end_;
\r
3240 ranges[1].m_length_ = ranges[0].m_length_;
\r
3241 ranges[1].m_length2_ = minLength;
\r
3242 // set the end of the first range according to count1
\r
3243 int i = ranges[0].m_length_;
\r
3244 int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1;
\r
3245 // ranges[0].count and count1 may be >countBytes from
\r
3246 // merging adjacent ranges; b > maxByte is possible
\r
3247 if (b <= maxByte) {
\r
3248 ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i,
\r
3252 ranges[0].m_end_ = setWeightByte(
\r
3253 incWeight(ranges[0].m_start_, i - 1,
\r
3255 i, b - countBytes);
\r
3257 // set the bytes in the end weight at length + 1..length2
\r
3259 b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8)
\r
3260 | maxByte; // this used to be 0xffffffff
\r
3261 ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i)
\r
3262 | (b >>> (i << 3))
\r
3263 & (b << ((4 - minLength) << 3));
\r
3264 // set the start of the second range to immediately follow
\r
3265 // the end of the first one
\r
3266 ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength,
\r
3268 // set the count values (informational)
\r
3269 ranges[0].m_count_ = count1;
\r
3270 ranges[1].m_count_ = count2;
\r
3272 ranges[0].m_count2_ = (int)(count1 * power_1);
\r
3273 // will be *countBytes when lengthened
\r
3274 ranges[1].m_count2_ = (int)(count2 * power_1);
\r
3276 // lengthen the second range to maxLength
\r
3277 lengthenRange(ranges, 1, maxByte, countBytes);
\r
3281 // no good match, lengthen all minLength ranges and iterate
\r
3282 for (int i=0; ranges[i].m_length2_ == minLength; ++ i) {
\r
3283 lengthenRange(ranges, i, maxByte, countBytes);
\r
3287 if (rangeCount > 1) {
\r
3288 // sort the ranges by weight values
\r
3289 Arrays.sort(ranges, 0, rangeCount);
\r
3292 // set maxByte in ranges[0] for ucol_nextWeight()
\r
3293 ranges[0].m_count_ = maxByte;
\r
3295 return rangeCount;
\r
3299 * Updates the range length
\r
3300 * @param range weight range array
\r
3301 * @param offset to weight range array
\r
3303 * @param countBytes
\r
3304 * @return new length
\r
3306 private static final int lengthenRange(WeightRange range[], int offset,
\r
3307 int maxByte, int countBytes)
\r
3309 int length = range[offset].m_length2_ + 1;
\r
3310 range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length,
\r
3311 RuleBasedCollator.BYTE_FIRST_TAILORED_);
\r
3312 range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length,
\r
3314 range[offset].m_count2_ *= countBytes;
\r
3315 range[offset].m_length2_ = length;
\r
3320 * Gets the weight
\r
3324 * @return new weight
\r
3326 private static final int setWeightTrail(int weight, int length, int trail)
\r
3328 length = (4 - length) << 3;
\r
3329 return (weight & (0xffffff00 << length)) | (trail << length);
\r
3333 * take two CE weights and calculate the
\r
3334 * possible ranges of weights between the two limits, excluding them
\r
3335 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
\r
3336 * @param lowerLimit
\r
3337 * @param upperLimit
\r
3339 * @param countBytes
\r
3341 * @return weight ranges
\r
// Computes the up-to-7 ranges of collation weights that lie strictly
// between lowerLimit and upperLimit (both exclusive), writing them into
// `ranges` shortest-length first and returning how many were produced.
// NOTE(review): this chunk was mangled by extraction — the embedded
// original line numbers skip (e.g. 3345, 3351-3352, 3356-3358, 3363-3371,
// 3395, 3397, 3399, 3403, 3415, 3417, 3419, 3426-3428, 3439, 3442, 3454,
// 3459-3465, 3470-3471, 3478-3479, 3483-3485), so braces, early
// `return 0` paths and several statements are missing below. Code is kept
// byte-identical; do not treat this fragment as compilable.
3343 private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte,
3344 int countBytes, WeightRange ranges[])
3346 // assume that both lowerLimit & upperLimit are not 0
3347 // get the lengths of the limits
3348 int lowerLength = lengthOfWeight(lowerLimit);
3349 int upperLength = lengthOfWeight(upperLimit);
// Degenerate interval: nothing fits between the limits (elided branch
// presumably returns 0 — TODO confirm against the full source).
3350 if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
3353 // check that neither is a prefix of the other
3354 if (lowerLength < upperLength) {
3355 if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
3359 // if the upper limit is a prefix of the lower limit then the earlier
3360 // test lowerLimit >= upperLimit has caught it
3361 // reset local variables
3362 // With the limit lengths of 1..4, there are up to 7 ranges for
3364 // range minimum length
3372 // We are now going to calculate up to 7 ranges.
3373 // Some of them will typically overlap, so we will then have to merge
3374 // and eliminate ranges.
3376 // We have to clean cruft from previous invocations
3377 // before doing anything. C++ already does that
3378 for(int length = 0; length < 5; length++) {
3379 m_utilLowerWeightRange_[length].clear();
3380 m_utilUpperWeightRange_[length].clear();
3382 m_utilWeightRange_.clear();
// Build the "lower" ranges: for each byte length, everything above the
// lower limit within the same prefix.
3384 int weight = lowerLimit;
3385 for (int length = lowerLength; length >= 2; -- length) {
3386 m_utilLowerWeightRange_[length].clear();
3387 int trail = getWeightByte(weight, length);
3388 if (trail < maxByte) {
3389 m_utilLowerWeightRange_[length].m_start_
3390 = incWeightTrail(weight, length);
3391 m_utilLowerWeightRange_[length].m_end_
3392 = setWeightTrail(weight, length, maxByte);
3393 m_utilLowerWeightRange_[length].m_length_ = length;
3394 m_utilLowerWeightRange_[length].m_count_ = maxByte - trail;
3396 weight = truncateWeight(weight, length - 1);
3398 m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);
// Build the "upper" ranges symmetrically, below the upper limit.
3400 weight = upperLimit;
3401 // [0] and [1] are not used - this simplifies indexing,
3402 // m_utilUpperWeightRange_
3404 for (int length = upperLength; length >= 2; length --) {
3405 int trail = getWeightByte(weight, length);
3406 if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
3407 m_utilUpperWeightRange_[length].m_start_
3408 = setWeightTrail(weight, length,
3409 RuleBasedCollator.BYTE_FIRST_TAILORED_);
3410 m_utilUpperWeightRange_[length].m_end_
3411 = decWeightTrail(weight, length);
3412 m_utilUpperWeightRange_[length].m_length_ = length;
3413 m_utilUpperWeightRange_[length].m_count_ = trail
3414 - RuleBasedCollator.BYTE_FIRST_TAILORED_;
3416 weight = truncateWeight(weight, length - 1);
3418 m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);
3420 // set the middle range
3421 m_utilWeightRange_.m_length_ = 1;
// Unsigned compare — weights use the full 32-bit range, so plain >=
// would be wrong (see the commented-out signed variant kept below).
3422 if (Utility.compareUnsigned(m_utilWeightRange_.m_end_, m_utilWeightRange_.m_start_) >= 0) {
3423 //if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) {
3424 m_utilWeightRange_.m_count_
3425 = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_)
3429 // eliminate overlaps
3430 // remove the middle range
3431 m_utilWeightRange_.m_count_ = 0;
3432 // reduce or remove the lower ranges that go beyond upperLimit
3433 for (int length = 4; length >= 2; -- length) {
3434 if (m_utilLowerWeightRange_[length].m_count_ > 0
3435 && m_utilUpperWeightRange_[length].m_count_ > 0) {
3436 int start = m_utilUpperWeightRange_[length].m_start_;
3437 int end = m_utilLowerWeightRange_[length].m_end_;
3438 if (end >= start || incWeight(end, length, maxByte)
3440 // lower and upper ranges collide or are directly
3441 // adjacent: merge these two and remove all shorter
3443 start = m_utilLowerWeightRange_[length].m_start_;
3444 end = m_utilLowerWeightRange_[length].m_end_
3445 = m_utilUpperWeightRange_[length].m_end_;
3446 // merging directly adjacent ranges needs to subtract
3447 // the 0/1 gaps in between;
3448 // it may result in a range with count>countBytes
3449 m_utilLowerWeightRange_[length].m_count_
3450 = getWeightByte(end, length)
3451 - getWeightByte(start, length) + 1
3452 + countBytes * (getWeightByte(end, length - 1)
3453 - getWeightByte(start,
3455 m_utilUpperWeightRange_[length].m_count_ = 0;
3456 while (-- length >= 2) {
3457 m_utilLowerWeightRange_[length].m_count_
3458 = m_utilUpperWeightRange_[length].m_count_ = 0;
3466 // copy the ranges, shortest first, into the result array
3467 int rangeCount = 0;
3468 if (m_utilWeightRange_.m_count_ > 0) {
3469 ranges[0] = new WeightRange(m_utilWeightRange_);
3472 for (int length = 2; length <= 4; ++ length) {
3473 // copy upper first so that later the middle range is more likely
3474 // the first one to use
3475 if (m_utilUpperWeightRange_[length].m_count_ > 0) {
3476 ranges[rangeCount]
3477 = new WeightRange(m_utilUpperWeightRange_[length]);
3480 if (m_utilLowerWeightRange_[length].m_count_ > 0) {
3481 ranges[rangeCount]
3482 = new WeightRange(m_utilLowerWeightRange_[length]);
3486 return rangeCount;
\r
3490 * Truncates the weight with length
\r
3493 * @return truncated weight
\r
3495 private static final int truncateWeight(int weight, int length)
\r
3497 return weight & (0xffffffff << ((4 - length) << 3));
\r
3501 * Length of the weight
\r
3503 * @return length of the weight
\r
3505 private static final int lengthOfWeight(int weight)
\r
3507 if ((weight & 0xffffff) == 0) {
\r
3510 else if ((weight & 0xffff) == 0) {
\r
3513 else if ((weight & 0xff) == 0) {
\r
3520 * Increment the weight trail
\r
3523 * @return new weight
\r
3525 private static final int incWeightTrail(int weight, int length)
\r
3527 return weight + (1 << ((4-length) << 3));
\r
3531 * Decrement the weight trail
\r
3534 * @return new weight
\r
3536 private static int decWeightTrail(int weight, int length)
\r
3538 return weight - (1 << ((4 - length) << 3));
\r
3542 * Gets the codepoint
\r
3543 * @param tbl contraction table
\r
3544 * @param codePoint code point to look for
\r
3545 * @return the offset to the code point
\r
// Linearly scans the contraction table's code-point string for codePoint;
// the '>' comparison in the loop shows the string is kept in ascending
// order. Returns the offset of the code point (per the javadoc above).
// NOTE(review): extraction dropped original lines 3548-3549, 3551,
// 3553-3555 and 3557-3560 — the position variable declaration, the loop
// increment and the return statements are missing here; kept byte-identical.
3547 private static int findCP(BasicContractionTable tbl, char codePoint)
3550 while (codePoint > tbl.m_codePoints_.charAt(position)) {
// NOTE(review): this bounds check uses '>' against length(); charAt(length())
// would throw before the check triggers — presumably the table layout
// guarantees a high terminator character; confirm before changing to '>='.
3552 if (position > tbl.m_codePoints_.length()) {
3556 if (codePoint == tbl.m_codePoints_.charAt(position)) {
\r
3565 * Finds a contraction ce
\r
3571 private static int findCE(ContractionTable table, int element, char ch)
\r
3573 if (table == null) {
\r
3574 return CE_NOT_FOUND_;
\r
3576 BasicContractionTable tbl = getBasicContractionTable(table, element);
\r
3577 if (tbl == null) {
\r
3578 return CE_NOT_FOUND_;
\r
3580 int position = findCP(tbl, ch);
\r
3581 if (position > tbl.m_CEs_.size() || position < 0) {
\r
3582 return CE_NOT_FOUND_;
\r
3584 return ((Integer)tbl.m_CEs_.get(position)).intValue();
\r
3588 * Checks if the string is tailored in the contraction
\r
3589 * @param table contraction table
\r
3591 * @param array character array to check
\r
3592 * @param offset array offset
\r
3593 * @return true if it is tailored
\r
// Walks the zero-terminated character array through the contraction table,
// following findCE links, to decide whether the string is tailored.
// NOTE(review): extraction dropped original lines 3597, 3601-3602,
// 3604-3607 and 3609+ — the braces, the return values of the three
// branches and the offset increment are missing here; code kept
// byte-identical, so the exact true/false outcomes of each branch cannot
// be documented from this view.
3595 private static boolean isTailored(ContractionTable table, int element,
3596 char array[], int offset)
3598 while (array[offset] != 0) {
3599 element = findCE(table, element, array[offset]);
// Chain broken: the next character is not in the contraction.
3600 if (element == CE_NOT_FOUND_) {
// Reached a real CE (not another contraction-table link).
3603 if (!isContractionTableElement(element)) {
// End of string: check whether the final element maps to a CE.
3608 if (getCE(table, element, 0) != CE_NOT_FOUND_) {
\r
3617 * Assemble RuleBasedCollator
\r
3618 * @param t build table
\r
3619 * @param collator to update
\r
// Assembles the final RuleBasedCollator from the build table: flattens the
// contraction table, serializes the trie, copies expansion / max-expansion /
// unsafe-codepoint / contraction-end data into the collator, and applies
// the parsed rule options.
// NOTE(review): extraction dropped several original lines (3622, 3627,
// 3634, 3640, 3649, 3656, 3662-3663, 3678, 3683, 3689, 3691, 3693, 3697,
// 3699) — mostly braces and one getChars() argument; kept byte-identical.
3621 private void assembleTable(BuildTable t, RuleBasedCollator collator)
3623 IntTrieBuilder mapping = t.m_mapping_;
3624 Vector expansions = t.m_expansions_;
3625 ContractionTable contractions = t.m_contractions_;
3626 MaxExpansionTable maxexpansion = t.m_maxExpansions_;
3628 // contraction offset has to be in since we are building on the
3629 // UCA contractions
3630 // int beforeContractions = (HEADER_SIZE_
3631 // + paddedsize(expansions.size() << 2)) >>> 1;
3632 collator.m_contractionOffset_ = 0;
// Flatten the contraction table; returns total number of entries.
3633 int contractionsSize = constructTable(contractions);
3635 // the following operation depends on the trie data. Therefore, we have
3636 // to do it before the trie is compacted
3637 // sets jamo expansions
3638 getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_,
3639 collator.m_isJamoSpecial_);
3641 // TODO: LATIN1 array is now in the utrie - it should be removed from
3642 // the calculation
3643 setAttributes(collator, t.m_options_);
3644 // copy expansions
3645 int size = expansions.size();
3646 collator.m_expansion_ = new int[size];
3647 for (int i = 0; i < size; i ++) {
3648 collator.m_expansion_[i] = ((Integer)expansions.get(i)).intValue();
3650 // contractions block
3651 if (contractionsSize != 0) {
3652 // copy contraction index
3653 collator.m_contractionIndex_ = new char[contractionsSize];
3654 contractions.m_codePoints_.getChars(0, contractionsSize,
3655 collator.m_contractionIndex_,
3657 // copy contraction collation elements
3658 collator.m_contractionCE_ = new int[contractionsSize];
3659 for (int i = 0; i < contractionsSize; i ++) {
3660 collator.m_contractionCE_[i] = ((Integer)
3661 contractions.m_CEs_.get(i)).intValue();
3664 // copy mapping table
3665 collator.m_trie_ = mapping.serialize(t,
3666 RuleBasedCollator.DataManipulate.getInstance());
3667 // copy max expansion table
3668 // not copying the first element which is a dummy
3669 // to be in synch with icu4c's builder, we continue to use the
3670 // expansion offset
3671 // omitting expansion offset in builder
3672 collator.m_expansionOffset_ = 0;
3673 size = maxexpansion.m_endExpansionCE_.size();
// Index 0 of the max-expansion vectors is a dummy, hence size - 1 and the
// loops starting at 1 below.
3674 collator.m_expansionEndCE_ = new int[size - 1];
3675 for (int i = 1; i < size; i ++) {
3676 collator.m_expansionEndCE_[i - 1] = ((Integer)
3677 maxexpansion.m_endExpansionCE_.get(i)).intValue();
3679 collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
3680 for (int i = 1; i < size; i ++) {
3681 collator.m_expansionEndCEMaxSize_[i - 1]
3682 = ((Byte)maxexpansion.m_expansionCESize_.get(i)).byteValue();
3684 // Unsafe chars table. Finish it off, then copy it.
3685 unsafeCPAddCCNZ(t);
3686 // Or in unsafebits from UCA, making a combined table.
3687 for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i ++) {
3688 t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
3690 collator.m_unsafe_ = t.m_unsafeCP_;
3692 // Finish building Contraction Ending chars hash table and then copy it
3694 // Or in unsafebits from UCA, making a combined table
3695 for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i ++) {
3696 t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
3698 collator.m_contractionEnd_ = t.m_contrEndCP_;
\r
3702 * Sets this collator to use the all options and tables in UCA.
\r
3703 * @param collator which attribute is to be set
\r
3704 * @param option to set with
\r
3706 private static final void setAttributes(RuleBasedCollator collator,
\r
3707 CollationRuleParser.OptionSet option)
\r
3709 collator.latinOneFailed_ = true;
\r
3710 collator.m_caseFirst_ = option.m_caseFirst_;
\r
3711 collator.setDecomposition(option.m_decomposition_);
\r
3712 collator.setAlternateHandlingShifted(
\r
3713 option.m_isAlternateHandlingShifted_);
\r
3714 collator.setCaseLevel(option.m_isCaseLevel_);
\r
3715 collator.setFrenchCollation(option.m_isFrenchCollation_);
\r
3716 collator.m_isHiragana4_ = option.m_isHiragana4_;
\r
3717 collator.setStrength(option.m_strength_);
\r
3718 collator.m_variableTopValue_ = option.m_variableTopValue_;
\r
3719 collator.latinOneFailed_ = false;
\r
3723 * Constructing the contraction table
\r
3724 * @param table contraction table
\r
// Flattens the per-element BasicContractionTables into the single
// contraction table (code-point string + CE vector), then rewrites every
// contraction CE (in both the flattened CEs and the trie mapping) so its
// offset field points at the flattened position. Returns the flattened size
// (per the javadoc above this method).
// NOTE(review): extraction dropped original lines 3728, 3731-3733, 3735,
// 3740, 3750-3751, 3757-3762, 3765, 3776-3779, 3787-3790 — including the
// `position` initialization, the ccMin/ccMax tracking inside the inner
// loop, and the return statement; code kept byte-identical.
3727 private int constructTable(ContractionTable table)
3729 // See how much memory we need
3730 int tsize = table.m_elements_.size();
3734 table.m_offsets_.clear();
// First pass: record the flattened start offset of each element.
3736 for (int i = 0; i < tsize; i ++) {
3737 table.m_offsets_.add(new Integer(position));
3738 position += ((BasicContractionTable)
3739 table.m_elements_.get(i)).m_CEs_.size();
3741 table.m_CEs_.clear();
3742 table.m_codePoints_.delete(0, table.m_codePoints_.length());
3743 // Now stuff the things in
3744 StringBuffer cpPointer = table.m_codePoints_;
3745 Vector CEPointer = table.m_CEs_;
3746 for (int i = 0; i < tsize; i ++) {
3747 BasicContractionTable bct = (BasicContractionTable)
3748 table.m_elements_.get(i);
3749 int size = bct.m_CEs_.size();
3752 int offset = CEPointer.size();
3753 CEPointer.add(bct.m_CEs_.get(0));
3754 for (int j = 1; j < size; j ++) {
3755 char ch = bct.m_codePoints_.charAt(j);
3756 char cc = (char)(UCharacter.getCombiningClass(ch) & 0xFF);
3763 cpPointer.append(ch);
3764 CEPointer.add(bct.m_CEs_.get(j));
// NOTE(review): BUG — `((ccMin == ccMax) ? 1 : 0 << 8)`: `0 << 8` binds
// tighter than `?:`, so this evaluates to (ccMin == ccMax ? 1 : 0) and the
// intended <<8 never happens; the intent was presumably
// `(((ccMin == ccMax) ? 1 : 0) << 8) | ccMax`. Not fixed here because the
// surrounding elided lines make a safe rewrite of this method impossible
// from this view.
3766 cpPointer.insert(offset,
3767 (char)(((ccMin == ccMax) ? 1 : 0 << 8) | ccMax));
// Rewrite contraction CEs in the flattened vector to use flattened offsets.
3768 for (int j = 0; j < size; j ++) {
3769 if (isContractionTableElement(((Integer)
3770 CEPointer.get(offset + j)).intValue())) {
3771 int ce = ((Integer)CEPointer.get(offset + j)).intValue();
3772 CEPointer.set(offset + j,
3773 new Integer(constructSpecialCE(getCETag(ce),
3774 ((Integer)table.m_offsets_.get(
3775 getContractionOffset(ce))).intValue())));
// Rewrite contraction CEs stored in the trie mapping as well.
3780 for (int i = 0; i <= 0x10FFFF; i ++) {
3781 int CE = table.m_mapping_.getValue(i);
3782 if (isContractionTableElement(CE)) {
3783 CE = constructSpecialCE(getCETag(CE),
3784 ((Integer)table.m_offsets_.get(
3785 getContractionOffset(CE))).intValue());
3786 table.m_mapping_.setValue(i, CE);
\r
3793 * Get contraction offset
\r
3794 * @param ce collation element
\r
3795 * @return contraction offset
\r
3797 private static final int getContractionOffset(int ce)
\r
3799 return ce & 0xFFFFFF;
\r
3803 * Gets the maximum Jamo expansion
\r
3804 * @param mapping trie table
\r
3805 * @param maxexpansion maximum expansion table
\r
3806 * @param maxjamoexpansion maximum jamo expansion table
\r
3807 * @param jamospecial is jamo special?
\r
// Registers maximum-expansion sizes for conjoining Jamo CEs: non-special
// vowel (V) CEs expand to at most 2 CEs, trailing-consonant (T) CEs to at
// most 3; when the Jamo are tailored ("special"), the recorded
// max-jamo-expansion data is folded into the general max-expansion table.
// NOTE(review): extraction dropped original lines 3812, 3814, 3817-3818,
// 3821, 3827-3830, 3832, 3837-3839, 3848-3849, 3851, 3853, 3857-3858,
// 3862-3866 — including the maxjamoexpansion parameter name, the
// VCOUNT/TCOUNT declarations, the loop decrements and the count handling;
// code kept byte-identical.
3809 private static void getMaxExpansionJamo(IntTrieBuilder mapping,
3810 MaxExpansionTable maxexpansion,
3811 MaxJamoExpansionTable
3813 boolean jamospecial)
// Hangul jamo bases: V (jungseong) starts at U+1161, T (jongseong) at U+11A8.
3815 int VBASE = 0x1161;
3816 int TBASE = 0x11A8;
3819 int v = VBASE + VCOUNT - 1;
3820 int t = TBASE + TCOUNT - 1;
// Walk the V jamo downwards; each non-special CE can expand to 2 CEs max.
3822 while (v >= VBASE) {
3823 int ce = mapping.getValue(v);
3824 if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_)
3825 != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3826 setMaxExpansion(ce, (byte)2, maxexpansion);
// Walk the T jamo downwards; each non-special CE can expand to 3 CEs max.
3831 while (t >= TBASE)
3833 int ce = mapping.getValue(t);
3834 if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_)
3835 != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3836 setMaxExpansion(ce, (byte)3, maxexpansion);
3840 // According to the docs, 99% of the time, the Jamo will not be special
3841 if (jamospecial) {
3842 // gets the max expansion in all unicode characters
3843 int count = maxjamoexpansion.m_endExpansionCE_.size();
3844 byte maxTSize = (byte)(maxjamoexpansion.m_maxLSize_ +
3845 maxjamoexpansion.m_maxVSize_ +
3846 maxjamoexpansion.m_maxTSize_);
3847 byte maxVSize = (byte)(maxjamoexpansion.m_maxLSize_ +
3848 maxjamoexpansion.m_maxVSize_);
3850 while (count > 0) {
// V-type entries get the L+V budget, others the full L+V+T budget.
3852 if (((Boolean)maxjamoexpansion.m_isV_.get(count)).booleanValue()
3854 setMaxExpansion(((Integer)
3855 maxjamoexpansion.m_endExpansionCE_.get(count)).intValue(),
3856 maxVSize, maxexpansion);
3859 setMaxExpansion(((Integer)
3860 maxjamoexpansion.m_endExpansionCE_.get(count)).intValue(),
3861 maxTSize, maxexpansion);
\r
3868 * To the UnsafeCP hash table, add all chars with combining class != 0
\r
3869 * @param t build table
\r
// Marks as "unsafe" every BMP character with a non-zero leading combining
// class (or a lead surrogate with FCD data), optionally building the
// combining-mark lookup table, and also marks the first NFC code point of
// every prefix entry so strcoll can back up over it.
// NOTE(review): extraction dropped original lines 3872, 3876-3877, 3880,
// 3890-3896, 3907, 3910-3912, 3916-3917 — including the `count` variable,
// the cm[pos] store and the index increment; code kept byte-identical.
3871 private static final void unsafeCPAddCCNZ(BuildTable t)
// Bitwise & on booleans — evaluates both sides but is equivalent to &&
// here; the CM table is built only once (cmLookup still null).
3873 boolean buildCMTable = ( buildCMTabFlag & (t.cmLookup==null) );
3874 char[] cm = null; // combining mark array
// index[cc] counts combining marks seen so far per combining class.
3875 int[] index= new int[256];
3878 if (buildCMTable) {
3879 cm = new char[0x10000];
3881 for (char c = 0; c < 0xffff; c ++) {
3882 char fcd = NormalizerImpl.getFCD16(c);
// fcd >= 0x100 means the leading combining class byte is non-zero.
3883 if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
3884 (UTF16.isLeadSurrogate(c) && fcd != 0)) {
3885 // c is a leading surrogate with some FCD data
3886 unsafeCPSet(t.m_unsafeCP_, c);
3887 if ( buildCMTable && (fcd!=0)) {
3888 int cc = (fcd& 0xff);
// Marks are bucketed by trailing combining class: 256 slots per class.
3889 int pos = (cc<<8)+index[cc];
3897 if (t.m_prefixLookup_ != null) {
3898 Enumeration els = t.m_prefixLookup_.elements();
3899 while (els.hasMoreElements()) {
3900 Elements e = (Elements)els.nextElement();
3901 // codepoints here are in the NFD form. We need to add the
3902 // first code point of the NFC form to unsafe, because
3903 // strcoll needs to backup over them.
3904 // weiv: This is wrong! See the comment above.
3905 //String decomp = Normalizer.decompose(e.m_cPoints_, true);
3906 //unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
3908 String comp = Normalizer.compose(e.m_cPoints_, false);
3909 unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
3913 if (buildCMTable) {
3914 t.cmLookup = new CombinClassTable();
3915 t.cmLookup.generate(cm, count, index);
\r
3921 * @param t build table
\r
3922 * @param collator RuleBasedCollator
\r
3923 * @param colEl collation element iterator
\r
3926 * @param type character type
\r
// For every assigned code point in [start, limit) whose canonical
// decomposition collates differently from its composed form, adds the
// composed form as a tailored element (copying the CEs of the decomposed
// form, or reusing a prefix entry's mapped CE) — the per-range step of
// canonical closure.
// NOTE(review): extraction dropped original lines 3933, 3937, 3941, 3949,
// 3961, 3963-3965, 3974, 3977-3978, 3980-3987 — including braces, the
// getDecomposition() trailing argument, the CE store target and the return
// statement; code kept byte-identical.
3929 private boolean enumCategoryRangeClosureCategory(BuildTable t,
3930 RuleBasedCollator collator,
3931 CollationElementIterator colEl,
3932 int start, int limit, int type)
3934 if (type != UCharacterCategory.UNASSIGNED
3935 && type != UCharacterCategory.PRIVATE_USE) {
3936 // if the range is assigned - we might ommit more categories later
3938 for (int u32 = start; u32 < limit; u32 ++) {
3939 int noOfDec = NormalizerImpl.getDecomposition(u32, false,
3940 m_utilCharBuffer_,
3942 if (noOfDec > 0) {
3943 // a positive count means the character has a canonical decomposition
3944 String comp = UCharacter.toString(u32);
3945 String decomp = new String(m_utilCharBuffer_, 0, noOfDec);
// Only close over characters whose composed and decomposed forms do not
// already collate equal.
3946 if (!collator.equals(comp, decomp)) {
3947 m_utilElement_.m_cPoints_ = decomp;
3948 m_utilElement_.m_prefix_ = 0;
3950 = (Elements)t.m_prefixLookup_.get(m_utilElement_);
3951 if (prefix == null) {
// No prefix entry: collect the CEs of the decomposed form.
3952 m_utilElement_.m_cPoints_ = comp;
3953 m_utilElement_.m_prefix_ = 0;
3954 m_utilElement_.m_prefixChars_ = null;
3955 colEl.setText(decomp);
3956 int ce = colEl.next();
3957 m_utilElement_.m_CELength_ = 0;
3958 while (ce != CollationElementIterator.NULLORDER) {
3959 m_utilElement_.m_CEs_[
3960 m_utilElement_.m_CELength_ ++]
3962 ce = colEl.next();
// Prefix entry found: reuse its single mapped CE.
3966 m_utilElement_.m_cPoints_ = comp;
3967 m_utilElement_.m_prefix_ = 0;
3968 m_utilElement_.m_prefixChars_ = null;
3969 m_utilElement_.m_CELength_ = 1;
3970 m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
3971 // This character uses a prefix. We have to add it
3972 // to the unsafe table, as it decomposed form is
3973 // already in. In Japanese, this happens for \u309e
3975 // Since unsafeCPSet is static in ucol_elm, we are
3976 // going to wrap it up in the unsafeCPAddCCNZ
3979 addAnElement(t, m_utilElement_);
\r
3988 * Determine if a character is a Jamo
\r
3989 * @param ch character to test
\r
3990 * @return true if ch is a Jamo, false otherwise
\r
3992 private static final boolean isJamo(char ch)
\r
3994 return (ch >= 0x1100 && ch <= 0x1112)
\r
3995 || (ch >= 0x1175 && ch <= 0x1161)
\r
3996 || (ch >= 0x11A8 && ch <= 0x11C2);
\r
4000 * Produces canonical closure
\r
// Produces the canonical closure of the tailoring: assembles a temporary
// collator from the build table, walks all Unicode characters (via the
// type iterator) adding composed forms whose decompositions collate
// differently, then re-walks the parsed rule tokens to add tail closures
// for tokens that contain a base character followed by combining marks.
// NOTE(review): extraction dropped original lines 4003, 4014-4016, 4019,
// 4035, 4044-4045, 4050-4051, 4053, 4057, 4061-4062, 4065-4068, 4071,
// 4073-4075 — including braces, the enum call's `type` argument and the
// substring end argument; code kept byte-identical.
4002 private void canonicalClosure(BuildTable t)
4004 BuildTable temp = new BuildTable(t);
// Assemble a throwaway collator reflecting the current tailoring so that
// closure decisions use tailored CEs.
4005 assembleTable(temp, temp.m_collator_);
4006 // produce canonical closure
4007 CollationElementIterator coleiter
4008 = temp.m_collator_.getCollationElementIterator("");
4009 RangeValueIterator typeiter = UCharacter.getTypeIterator();
4010 RangeValueIterator.Element element = new RangeValueIterator.Element();
4011 while (typeiter.next(element)) {
4012 enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter,
4013 element.start, element.limit,
// Transfer the combining-mark lookup built in the temporary table.
4017 t.cmLookup = temp.cmLookup;
4018 temp.cmLookup = null;
4020 for (int i = 0; i < m_parser_.m_resultLength_; i ++) {
4021 char baseChar, firstCM;
4022 // now we need to generate the CEs
4023 // We stuff the initial value in the buffers, and increase the
4024 // appropriate buffer according to strength */
4025 // createElements(t, m_parser_.m_listHeader_[i]);
4026 CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_;
4027 m_utilElement_.clear();
4028 while ( tok!=null ) {
4029 m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
4030 m_utilElement_.m_cPointsOffset_ = 0; //el.m_uchars_;
// Token source/prefix fields pack length in the top 8 bits and the offset
// into m_parser_.m_source_ in the low 24 bits.
4031 if (tok.m_prefix_ != 0) {
4032 // we will just copy the prefix here, and adjust accordingly in
4033 // the addPrefix function in ucol_elm. The reason is that we
4034 // need to add both composed AND decomposed elements to the
4036 int size = tok.m_prefix_ >> 24;
4037 int offset = tok.m_prefix_ & 0x00FFFFFF;
4038 m_utilElement_.m_prefixChars_
4039 = m_parser_.m_source_.substring(offset, offset + size);
4040 size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
4041 offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);
4042 m_utilElement_.m_uchars_
4043 = m_parser_.m_source_.substring(offset, offset + size);
4046 m_utilElement_.m_prefixChars_ = null;
4047 int offset = tok.m_source_ & 0x00FFFFFF;
4048 int size = tok.m_source_ >>> 24;
4049 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(offset,
4052 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
// Scan the token's code points for a base character (combining class 0)
// followed by a combining mark; if found, add tail closures.
4054 baseChar = firstCM = 0; //reset
4055 for (int j = 0; j < m_utilElement_.m_cPoints_.length()
4056 - m_utilElement_.m_cPointsOffset_; j ++) {
4058 char fcd = NormalizerImpl.getFCD16(m_utilElement_.m_cPoints_.charAt(j));
4059 if ( (fcd & 0xff) == 0 ) {
4060 baseChar = m_utilElement_.m_cPoints_.charAt(j);
4063 if ( (baseChar!=0) && (firstCM==0) ) {
4064 firstCM = m_utilElement_.m_cPoints_.charAt(j); // first combining mark
4069 if ( (baseChar!=0) && (firstCM!=0)) {
4070 addTailCanonicalClosures(t, temp.m_collator_, coleiter, baseChar, firstCM);
4072 tok = tok.m_next_;
\r
// For a tailored element ending in the combining mark cMark after baseChar,
// adds equivalent tailorings for every combining mark of lower combining
// class: base+mark pairs that compose to a single precomposed character are
// collected for addMultiCMontractions; otherwise the mark is substituted
// into the element and both decomposed and composed+mark variants are added.
// NOTE(review): extraction dropped original lines 4080-4081 (the baseChar
// and cMark parameter declarations), 4083-4084, 4088, 4091, 4093-4094,
// 4096, 4105, 4110-4111, 4113-4114, 4120, 4127, 4137-4139, 4145, 4148,
// 4153, 4157-4161 — including the early return, precompLen/maxIndex
// declarations and several braces; code kept byte-identical.
4077 private void addTailCanonicalClosures(BuildTable t,
4078 RuleBasedCollator m_collator,
4079 CollationElementIterator colEl,
// Nothing to do without the combining-mark lookup table.
4082 if ( t.cmLookup == null ) {
4085 CombinClassTable cmLookup = t.cmLookup;
4086 int[] index = cmLookup.index;
4087 int cClass = NormalizerImpl.getFCD16(cMark) & 0xff;
// Precomposed characters discovered along the way, with their mark classes.
4089 char[] precompCh = new char[256];
4090 int[] precompClass = new int[256];
4092 Elements element = new Elements();
// index[cClass-1] = number of marks with combining class below cMark's.
4095 maxIndex = index[cClass-1];
4097 for (int i=0; i<maxIndex; i++) {
4098 StringBuffer decompBuf = new StringBuffer();
4099 decompBuf.append(baseChar).append(cmLookup.cPoints[i]);
4100 String comp = Normalizer.compose(decompBuf.toString(), false);
4101 if ( comp.length() == 1 ) {
// base+mark composes to one character: remember it for the multi-CM pass.
4102 precompCh[precompLen] = comp.charAt(0);
4103 precompClass[precompLen] =
4104 (NormalizerImpl.getFCD16(cmLookup.cPoints[i]) & 0xff);
// Otherwise substitute this mark for cMark inside the element's string.
4106 StringBuffer decomp = new StringBuffer();
4107 for (int j=0; j < m_utilElement_.m_cPoints_.length(); j++) {
4108 if ( m_utilElement_.m_cPoints_.charAt(j) == cMark ) {
4109 decomp.append(cmLookup.cPoints[i]);
4112 decomp.append(m_utilElement_.m_cPoints_.charAt(j));
4115 comp = Normalizer.compose(decomp.toString(), false);
// Both variants keep the original cMark appended at the end.
4116 StringBuffer buf= new StringBuffer(comp);
4117 buf.append(cMark);
4118 decomp.append(cMark);
4119 comp = buf.toString();
4121 element.m_cPoints_ = decomp.toString();
4122 element.m_CELength_ = 0;
4123 element.m_prefix_ = 0;
4124 Elements prefix = (Elements)t.m_prefixLookup_.get(element);
4125 element.m_cPoints_ = comp;
4126 element.m_uchars_ = comp;
4128 if (prefix == null) {
// No prefix entry: collect CEs of the decomposed variant.
4129 element.m_prefix_ = 0;
4130 element.m_prefixChars_ = null;
4131 colEl.setText(decomp.toString());
4132 int ce = colEl.next();
4133 element.m_CELength_ = 0;
4134 while (ce != CollationElementIterator.NULLORDER) {
4135 element.m_CEs_[element.m_CELength_ ++] = ce;
4136 ce = colEl.next();
// Prefix entry found: reuse its single mapped CE.
4140 element.m_cPoints_ = comp;
4141 element.m_prefix_ = 0;
4142 element.m_prefixChars_ = null;
4143 element.m_CELength_ = 1;
4144 element.m_CEs_[0] = prefix.m_mapCE_;
4146 setMapCE(t, element);
4147 finalizeAddition(t, element);
4149 if (comp.length()>2) {
4150 // This is a fix for tailoring contractions with accented
4151 // character at the end of contraction string.
4152 addFCD4AccentedContractions(t, colEl, comp, element);
4154 if ( precompLen > 1 ) {
4155 precompLen = addMultiCMontractions(t, colEl, element, precompCh,
4156 precompClass, precompLen, cMark, i, decomp.toString());
\r
// Computes and stores element.m_mapCE_: a compact "long primary" special CE
// when the element is a two-CE expansion whose continuation carries only
// primary weights and whose secondary/tertiary are the common bytes;
// otherwise an expansion special CE whose payload indexes into the shared
// expansions vector (with the CE count in the low nibble when it fits).
// NOTE(review): extraction dropped original lines 4166, 4174, 4182-4183,
// 4189, 4192, 4195-4196, 4198, 4202-4204 — including the else keyword, the
// length-doesn't-fit branch (terminator handling) and closing braces; code
// kept byte-identical.
4163 private void setMapCE(BuildTable t, Elements element) {
4164 Vector expansions = t.m_expansions_;
4165 element.m_mapCE_ = 0;
4167 if (element.m_CELength_ == 2 // a two CE expansion
4168 && RuleBasedCollator.isContinuation(element.m_CEs_[1])
4169 && (element.m_CEs_[1]
4170 & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that has only primaries in continuation
4171 && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
4172 // a common secondary
4173 && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) { // and a common tertiary
4175 element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
4176 // a long primary special
4177 | (CE_LONG_PRIMARY_TAG_ << 24)
4178 // first and second byte of primary
4179 | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
4180 // third byte of primary
4181 | ((element.m_CEs_[1] >> 24) & 0xFF);
4184 // omitting expansion offset in builder
4185 // (HEADER_SIZE_ >> 2)
// Expansion CE: tag + 20-bit offset of the first CE in the expansions
// vector (shifted into bits 4..23), with the length in the low 4 bits.
4186 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
4187 | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
4188 | (addExpansion(expansions, element.m_CEs_[0]) << 4) & 0xFFFFF0;
4190 for (int i = 1; i < element.m_CELength_; i ++) {
4191 addExpansion(expansions, element.m_CEs_[i]);
4193 if (element.m_CELength_ <= 0xF) {
4194 expansion |= element.m_CELength_;
// Length does not fit in the nibble: a zero terminator is appended instead
// (elided branch — TODO confirm against the full source).
4197 addExpansion(expansions, 0);
4199 element.m_mapCE_ = expansion;
4200 setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
4201 (byte)element.m_CELength_, t.m_maxExpansions_);
\r
// Adds contractions that involve multiple combining marks: for each saved
// precomposed character it builds (up to two iterations) a new
// decomposition with cMark appended, and if that re-composes to a single
// character, registers the composed+mark element (reusing a prefix entry's
// CE when one exists) and records the new precomposed character. Returns
// the updated count of precomposed characters.
// NOTE(review): extraction dropped original lines 4207-4208, 4210-4214
// (the element/precompCh/maxComp/cMark/cmPos/decomp parameter
// declarations), 4220, 4222, 4224-4225, 4227, 4233-4234, 4238, 4256-4258,
// 4264, 4270, 4272 — including the `count` reset and several braces; code
// kept byte-identical.
4205 private int addMultiCMontractions(BuildTable t,
4206 CollationElementIterator colEl,
4209 int[] precompClass,
4215 CombinClassTable cmLookup = t.cmLookup;
4216 char[] combiningMarks = {cMark};
4217 int cMarkClass = (int)(UCharacter.getCombiningClass(cMark) & 0xFF);
4218 String comMark = new String(combiningMarks);
4219 int noOfPrecomposedChs = maxComp;
4221 for (int j=0; j<maxComp; j++) {
4223 StringBuffer temp;
4226 String newDecomp, comp;
// First pass uses the decomposition of the saved precomposed char plus the
// current mark; second pass appends the precomposed char to `decomp`.
4228 if ( count==0 ) { // Decompose the saved precomposed char.
4229 newDecomp = Normalizer.decompose( new String(precompCh, j ,1), false);
4230 temp = new StringBuffer(newDecomp);
4231 temp.append(cmLookup.cPoints[cmPos]);
4232 newDecomp = temp.toString();
4235 temp = new StringBuffer(decomp);
4236 temp.append(precompCh[j]);
4237 newDecomp = temp.toString();
4239 comp = Normalizer.compose(newDecomp, false);
4240 if (comp.length()==1) {
// The candidate composes to a single character: register comp+cMark.
4241 temp.append(cMark);
4242 element.m_cPoints_ = temp.toString();
4243 element.m_CELength_ = 0;
4244 element.m_prefix_ = 0;
4245 Elements prefix = (Elements)t.m_prefixLookup_.get(element);
4246 element.m_cPoints_ = comp+comMark;
4247 if (prefix == null) {
// No prefix entry: collect CEs of the decomposed form.
4248 element.m_prefix_ = 0;
4249 element.m_prefixChars_ = null;
4250 colEl.setText(temp.toString());
4251 int ce = colEl.next();
4252 element.m_CELength_ = 0;
4253 while (ce != CollationElementIterator.NULLORDER) {
4254 element.m_CEs_[element.m_CELength_ ++] = ce;
4255 ce = colEl.next();
// Prefix entry found: reuse its single mapped CE.
4259 element.m_cPoints_ = comp;
4260 element.m_prefix_ = 0;
4261 element.m_prefixChars_ = null;
4262 element.m_CELength_ = 1;
4263 element.m_CEs_[0] = prefix.m_mapCE_;
4265 setMapCE(t, element);
4266 finalizeAddition(t, element);
// Remember the newly discovered precomposed character.
4267 precompCh[noOfPrecomposedChs] = comp.charAt(0);
4268 precompClass[noOfPrecomposedChs] = cMarkClass;
4269 noOfPrecomposedChs++;
// At most two passes, and only while the saved class matches cMark's.
4271 }while(++count<2 && (precompClass[j]==cMarkClass));
4273 return noOfPrecomposedChs;
\r
// Fix for tailored contractions ending in an accented character: registers
// the NFD form of `data` as an element (with the CEs of the decomposed
// form, or a prefix entry's mapped CE) so FCD processing finds it.
// NOTE(review): extraction dropped original lines 4278 (the `String data`
// parameter declaration, inferred from its use below), 4282, 4298 and
// 4300-4302 — including the else-branch using `prefix` and closing braces;
// code kept byte-identical.
4276 private void addFCD4AccentedContractions(BuildTable t,
4277 CollationElementIterator colEl,
4279 Elements element) {
4280 String decomp = Normalizer.decompose(data, false);
4281 String comp = Normalizer.compose(data, false);
4283 element.m_cPoints_ = decomp;
4284 element.m_CELength_ = 0;
4285 element.m_prefix_ = 0;
4286 Elements prefix = (Elements)t.m_prefixLookup_.get(element);
4287 if (prefix == null) {
4288 element.m_cPoints_ = comp;
4289 element.m_prefix_ = 0;
4290 element.m_prefixChars_ = null;
4291 element.m_CELength_ = 0;
4292 colEl.setText(decomp);
4293 int ce = colEl.next();
4294 element.m_CELength_ = 0;
4295 while (ce != CollationElementIterator.NULLORDER) {
4296 element.m_CEs_[element.m_CELength_ ++] = ce;
4297 ce = colEl.next();
4299 addAnElement(t, element);
\r
4303 private void processUCACompleteIgnorables(BuildTable t)
\r
4305 TrieIterator trieiterator
\r
4306 = new TrieIterator(RuleBasedCollator.UCA_.m_trie_);
\r
4307 RangeValueIterator.Element element = new RangeValueIterator.Element();
\r
4308 while (trieiterator.next(element)) {
\r
4309 int start = element.start;
\r
4310 int limit = element.limit;
\r
4311 if (element.value == 0) {
\r
4312 while (start < limit) {
\r
4313 int CE = t.m_mapping_.getValue(start);
\r
4314 if (CE == CE_NOT_FOUND_) {
\r
4315 m_utilElement_.m_prefix_ = 0;
\r
4316 m_utilElement_.m_uchars_ = UCharacter.toString(start);
\r
4317 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
\r
4318 m_utilElement_.m_cPointsOffset_ = 0;
\r
4319 m_utilElement_.m_CELength_ = 1;
\r
4320 m_utilElement_.m_CEs_[0] = 0;
\r
4321 addAnElement(t, m_utilElement_);
\r