1 /**
2  *******************************************************************************
3  * Copyright (C) 1996-2011, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8
9 import java.io.IOException;
10 import java.text.ParseException;
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.Collections;
14 import java.util.Enumeration;
15 import java.util.HashMap;
16 import java.util.List;
17 import java.util.Map;
18
19 import com.ibm.icu.impl.IntTrieBuilder;
20 import com.ibm.icu.impl.Norm2AllModes;
21 import com.ibm.icu.impl.Normalizer2Impl;
22 import com.ibm.icu.impl.TrieBuilder;
23 import com.ibm.icu.impl.TrieIterator;
24 import com.ibm.icu.impl.UCharacterProperty;
25 import com.ibm.icu.impl.Utility;
26 import com.ibm.icu.lang.UCharacter;
27 import com.ibm.icu.lang.UCharacterCategory;
28 import com.ibm.icu.util.RangeValueIterator;
29 import com.ibm.icu.util.VersionInfo;
30
31 /**
32  * Class for building a collator from a list of collation rules. This class
33  * uses CollationRuleParser.
34  * 
35  * @author Syn Wee Quek
36  * @since release 2.2, June 11 2002
37  */
38 final class CollationParsedRuleBuilder {
39     // package private constructors ------------------------------------------
40
41     /**
42      * Constructor
43      * 
44      * @param rules
45      *            collation rules
46      * @exception ParseException
47      *                thrown when argument rules have an invalid syntax
48      */
49     CollationParsedRuleBuilder(String rules) throws ParseException {
50         m_parser_ = new CollationRuleParser(rules);
51         m_parser_.assembleTokenList();
52         m_utilColEIter_ = RuleBasedCollator.UCA_
53                 .getCollationElementIterator("");
54     }
55
56     // package private inner classes -----------------------------------------
57
58     /**
59      * Inverse UCA wrapper
60      */
61     static class InverseUCA {
62         // package private constructor ---------------------------------------
63
64         InverseUCA() {
65         }
66
67         // package private data member ---------------------------------------
68
69         /**
70          * Array of inverse UCA collation elements (stored in triples)
71          */
72         int m_table_[];
73         /**
74          * Array list of continuation characters
75          */
76         char m_continuations_[];
77
78         /**
79          * UCA version of inverse UCA table
80          */
81         VersionInfo m_UCA_version_;
82
83         // package private method --------------------------------------------
84
85         /**
86          * Returns the previous inverse CEs of the argument CEs
87          * 
88          * @param ce
89          *            ce to test
90          * @param contce
91          *            continuation ce to test
92          * @param strength
93          *            collation strength
94          * @param prevresult
95          *            an array used to return the previous inverse CE and the
96          *            previous inverse continuation CE
97          * @return index of the previous inverse CE in the table, or -1 if not found
98          */
99         final int getInversePrevCE(int ce, int contce, int strength,
100                 int prevresult[]) {
101             int result = findInverseCE(ce, contce);
102
103             if (result < 0) {
104                 prevresult[0] = CollationElementIterator.NULLORDER;
105                 return -1;
106             }
107
108             ce &= STRENGTH_MASK_[strength];
109             contce &= STRENGTH_MASK_[strength];
110
111             prevresult[0] = ce;
112             prevresult[1] = contce;
113
114             while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
115                     && (prevresult[1] & STRENGTH_MASK_[strength]) == contce
116                     && result > 0) {
117                 // this condition should prevent falling off the edge of the
118                 // world
119                 // here, we end up in a singularity - zero
120                 prevresult[0] = m_table_[3 * (--result)];
121                 prevresult[1] = m_table_[3 * result + 1];
122             }
123             return result;
124         }
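        // Illustrative sketch of the strength masking used above, with a purely
        // hypothetical CE value chosen only for demonstration:
        //     ce = 0x26E70545
        //     ce & STRENGTH_MASK_[Collator.PRIMARY]   -> 0x26E70000  (primary only)
        //     ce & STRENGTH_MASK_[Collator.SECONDARY] -> 0x26E70500  (primary + secondary)
        //     ce & STRENGTH_MASK_[Collator.TERTIARY]  -> 0x26E70545  (all weights)
        // getInversePrevCE() keeps stepping backwards through m_table_ while the
        // masked weights of the previous entry still equal the masked input.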
125
126         final int getCEStrengthDifference(int CE, int contCE, int prevCE,
127                 int prevContCE) {
128             int strength = Collator.TERTIARY;
129             while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
130                     && (strength != 0)) {
131                 strength--;
132             }
133             return strength;
134         }
135
136         private int compareCEs(int source0, int source1, int target0,
137                 int target1) {
138             int s1 = source0, s2, t1 = target0, t2;
139             if (RuleBasedCollator.isContinuation(source1)) {
140                 s2 = source1;
141             } else {
142                 s2 = 0;
143             }
144             if (RuleBasedCollator.isContinuation(target1)) {
145                 t2 = target1;
146             } else {
147                 t2 = 0;
148             }
149
150             int s = 0, t = 0;
151             if (s1 == t1 && s2 == t2) {
152                 return 0;
153             }
154             s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >>> 16);
155             t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >>> 16);
156             if (s == t) {
157                 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;
158                 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;
159                 if (s == t) {
160                     s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);
161                     t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);
162                     return Utility.compareUnsigned(s, t);
163                 } else {
164                     return Utility.compareUnsigned(s, t);
165                 }
166             } else {
167                 return Utility.compareUnsigned(s, t);
168             }
169         }
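        // A worked example of the comparison key built above, using hypothetical
        // CE values (not taken from any real UCA data). For source0 = 0x12340500
        // and source1 = 0x567805C0 (assumed to carry the continuation marker):
        //     (0x12340500 & 0xFFFF0000) | ((0x567805C0 & 0xFFFF0000) >>> 16)
        //         = 0x12340000 | 0x00005678 = 0x12345678
        // i.e. the leading CE's primary weight in the high 16 bits and the
        // continuation's primary weight in the low 16 bits, compared unsigned;
        // only if the primaries tie are the secondary and tertiary bytes compared.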
170
171         /**
172          * Finding the index of the argument CEs in the inverse UCA table
173          * 
174          * @param ce
175          *            CE to be tested
176          * @param contce
177          *            continuation CE
178          * @return index of the inverse CE in the table
179          */
180         int findInverseCE(int ce, int contce) {
181             int bottom = 0;
182             int top = m_table_.length / 3;
183             int result = 0;
184
185             while (bottom < top - 1) {
186                 result = (top + bottom) >> 1;
187                 int first = m_table_[3 * result];
188                 int second = m_table_[3 * result + 1];
189                 int comparison = compareCEs(first, second, ce, contce);
190                 if (comparison > 0) {
191                     top = result;
192                 } else if (comparison < 0) {
193                     bottom = result;
194                 } else {
195                     break;
196                 }
197             }
198
199             return result;
200         }
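        // Layout note, inferred from the indexing above: m_table_ holds the
        // inverse UCA as consecutive triples, so entry i occupies
        //     m_table_[3 * i]      (CE)
        //     m_table_[3 * i + 1]  (continuation CE)
        //     m_table_[3 * i + 2]  (auxiliary data, not examined here)
        // and findInverseCE() binary-searches those triples using compareCEs().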
201
202         /**
203          * Getting gap offsets in the inverse UCA
204          * 
205          * @param listheader
206          *            parsed token lists
207          * @exception Exception
208          *                thrown when error occurs while finding the collation
209          *                gaps
210          */
211         void getInverseGapPositions(
212                 CollationRuleParser.TokenListHeader listheader)
213                 throws Exception {
214             // reset all the gaps
215             CollationRuleParser.Token token = listheader.m_first_;
216             int tokenstrength = token.m_strength_;
217
218             for (int i = 0; i < 3; i++) {
219                 listheader.m_gapsHi_[3 * i] = 0;
220                 listheader.m_gapsHi_[3 * i + 1] = 0;
221                 listheader.m_gapsHi_[3 * i + 2] = 0;
222                 listheader.m_gapsLo_[3 * i] = 0;
223                 listheader.m_gapsLo_[3 * i + 1] = 0;
224                 listheader.m_gapsLo_[3 * i + 2] = 0;
225                 listheader.m_numStr_[i] = 0;
226                 listheader.m_fStrToken_[i] = null;
227                 listheader.m_lStrToken_[i] = null;
228                 listheader.m_pos_[i] = -1;
229             }
230
231             if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
232                     && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {
233                 // implicits -
234                 listheader.m_pos_[0] = 0;
235                 int t1 = listheader.m_baseCE_;
236                 int t2 = listheader.m_baseContCE_;
237                 listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
238                 listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
239                 listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
240                 int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_
241                         | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
242                 primaryCE = RuleBasedCollator.impCEGen_
243                         .getImplicitFromRaw(RuleBasedCollator.impCEGen_
244                                 .getRawFromImplicit(primaryCE) + 1);
245
246                 t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
247                 t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_
248                         | RuleBasedCollator.CE_CONTINUATION_MARKER_;
249
250                 // if (listheader.m_baseCE_ < 0xEF000000) {
251                 // // first implicits have three byte primaries, with a gap of
252                 // // one so we essentially need to add 2 to the top byte in
253                 // // listheader.m_baseContCE_
254                 // t2 += 0x02000000;
255                 // }
256                 // else {
257                 // // second implicits have four byte primaries, with a gap of
258                 // // IMPLICIT_LAST2_MULTIPLIER_
259                 // // Now, this guy is not really accessible here, so until we
260                 // // find a better way to pass it around, assume that the gap
261                 // is 1
262                 // t2 += 0x00020000;
263                 // }
264                 listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
265                 listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
266                 listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
267             } else if (listheader.m_indirect_ == true
268                     && listheader.m_nextCE_ != 0) {
269                 listheader.m_pos_[0] = 0;
270                 int t1 = listheader.m_baseCE_;
271                 int t2 = listheader.m_baseContCE_;
272                 listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
273                 listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
274                 listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
275                 t1 = listheader.m_nextCE_;
276                 t2 = listheader.m_nextContCE_;
277                 listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
278                 listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
279                 listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
280             } else {
281                 while (true) {
282                     if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
283                         listheader.m_pos_[tokenstrength] = getInverseNext(
284                                 listheader, tokenstrength);
285                         if (listheader.m_pos_[tokenstrength] >= 0) {
286                             listheader.m_fStrToken_[tokenstrength] = token;
287                         } else {
288                             // The CE must be implicit, since it's not in the
289                             // table
290                             // Error
291                             throw new Exception("Internal program error");
292                         }
293                     }
294
295                     while (token != null && token.m_strength_ >= tokenstrength) {
296                         if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
297                             listheader.m_lStrToken_[tokenstrength] = token;
298                         }
299                         token = token.m_next_;
300                     }
301                     if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
302                         // check if previous interval is the same and merge the
303                         // intervals if it is so
304                         if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {
305                             listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];
306                             listheader.m_fStrToken_[tokenstrength + 1] = null;
307                             listheader.m_lStrToken_[tokenstrength + 1] = null;
308                             listheader.m_pos_[tokenstrength + 1] = -1;
309                         }
310                     }
311                     if (token != null) {
312                         tokenstrength = token.m_strength_;
313                     } else {
314                         break;
315                     }
316                 }
317                 for (int st = 0; st < 3; st++) {
318                     int pos = listheader.m_pos_[st];
319                     if (pos >= 0) {
320                         int t1 = m_table_[3 * pos];
321                         int t2 = m_table_[3 * pos + 1];
322                         listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
323                                 Collator.PRIMARY);
324                         listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2,
325                                 Collator.SECONDARY);
326                         listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
327                                 | (t2 & 0x3f) << 16;
328                         // pos --;
329                         // t1 = m_table_[3 * pos];
330                         // t2 = m_table_[3 * pos + 1];
331                         t1 = listheader.m_baseCE_;
332                         t2 = listheader.m_baseContCE_;
333
334                         listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
335                                 Collator.PRIMARY);
336                         listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2,
337                                 Collator.SECONDARY);
338                         listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
339                                 | (t2 & 0x3f) << 16;
340                     }
341                 }
342             }
343         }
344
345         /**
346          * Gets the next CE in the inverse table
347          * 
348          * @param listheader
349          *            token list header
350          * @param strength
351          *            collation strength
352          * @return index of the next CE in the inverse table, or -1 if not found
353          */
354         private final int getInverseNext(
355                 CollationRuleParser.TokenListHeader listheader, int strength) {
356             int ce = listheader.m_baseCE_;
357             int secondce = listheader.m_baseContCE_;
358             int result = findInverseCE(ce, secondce);
359
360             if (result < 0) {
361                 return -1;
362             }
363
364             ce &= STRENGTH_MASK_[strength];
365             secondce &= STRENGTH_MASK_[strength];
366
367             int nextce = ce;
368             int nextcontce = secondce;
369
370             while ((nextce & STRENGTH_MASK_[strength]) == ce
371                     && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
372                 nextce = m_table_[3 * (++result)];
373                 nextcontce = m_table_[3 * result + 1];
374             }
375
376             listheader.m_nextCE_ = nextce;
377             listheader.m_nextContCE_ = nextcontce;
378
379             return result;
380         }
381     }
382
383     // package private data members ------------------------------------------
384
385     /**
386      * Inverse UCA, instantiate only when required
387      */
388     static final InverseUCA INVERSE_UCA_;
389
390     /**
391      * UCA and Inverse UCA version do not match
392      */
393     private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";
394
395     /**
396      * Error message for when the UCA has not been instantiated
397      */
398     private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";
399
400     /**
401      * Initializing the inverse UCA
402      */
403     static {
404         InverseUCA temp = null;
405         try {
406             temp = CollatorReader.getInverseUCA();
407         } catch (IOException e) {
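            // ignored: a null result is reported via UCA_NOT_INSTANTIATED_ below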
408         }
409         /*
410          * try { String invdat = "/com/ibm/icu/impl/data/invuca.icu";
411          * InputStream i =
412          * CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
413          * BufferedInputStream b = new BufferedInputStream(i, 110000);
414          * INVERSE_UCA_ = CollatorReader.readInverseUCA(b); b.close();
415          * i.close(); } catch (Exception e) { e.printStackTrace(); throw new
416          * RuntimeException(e.getMessage()); }
417          */
418
419         if (temp != null && RuleBasedCollator.UCA_ != null) {
420             if (!temp.m_UCA_version_
421                     .equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
422                 throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
423             }
424         } else {
425             throw new RuntimeException(UCA_NOT_INSTANTIATED_);
426         }
427
428         INVERSE_UCA_ = temp;
429     }
430
431     // package private methods -----------------------------------------------
432
433     /**
434      * Parses and sets the collation rules in the argument collator
435      * 
436      * @param collator
437      *            to set
438      * @exception Exception
439      *                thrown when internal program error occurs
440      */
441     void setRules(RuleBasedCollator collator) throws Exception {
442         if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) {
443             // we have a set of rules, let's make something of it
444             assembleTailoringTable(collator);
445         } else { // no rules, but no error either; must be only options
446             // We will init the collator from UCA
447             collator.setWithUCATables();
448         }
449         // And set only the options
450         m_parser_.setDefaultOptionsInCollator(collator);
451     }
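    // Typical usage, as a sketch; the calling collator and the rule string here
    // are assumptions for illustration, not taken from this file:
    //     CollationParsedRuleBuilder builder =
    //             new CollationParsedRuleBuilder("&a < b << c <<< d");
    //     builder.setRules(someRuleBasedCollator);
    // The constructor parses and assembles the token list; setRules() then either
    // assembles a tailoring table or falls back to the UCA tables plus options.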
452
453     private void copyRangeFromUCA(BuildTable t, int start, int end) {
454         int u = 0;
455         for (u = start; u <= end; u++) {
456             // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
457             int CE = t.m_mapping_.getValue(u);
458             if (CE == CE_NOT_FOUND_
459             // this test is for contractions that are missing the starting
460                     // element. Looks like latin-1 should be done before
461                     // assembling the table, even if it results in more false
462                     // closure elements
463                     || (isContractionTableElement(CE) && getCE(
464                             t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
465                 // m_utilElement_.m_uchars_ = str.toString();
466                 m_utilElement_.m_uchars_ = UCharacter.toString(u);
467                 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
468                 m_utilElement_.m_prefix_ = 0;
469                 m_utilElement_.m_CELength_ = 0;
470                 m_utilElement_.m_prefixChars_ = null;
471                 m_utilColEIter_.setText(m_utilElement_.m_uchars_);
472                 while (CE != CollationElementIterator.NULLORDER) {
473                     CE = m_utilColEIter_.next();
474                     if (CE != CollationElementIterator.NULLORDER) {
475                         m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
476                     }
477                 }
478                 addAnElement(t, m_utilElement_);
479             }
480         }
481     }
482
483     /**
484      * 2. Eliminate the negative lists by doing the following for each non-null
485      * negative list: o if previousCE(baseCE, strongestN) != some ListHeader X's
486      * baseCE, create new ListHeader X o reverse the list, add to the end of X's
487      * positive list. Reset the strength of the first item you add, based on the
488      * stronger strength levels of the two lists.
489      * 
490      * 3. For each ListHeader with a non-null positive list: o Find all
491      * character strings with CEs between the baseCE and the next/previous CE,
492      * at the strength of the first token. Add these to the tailoring. ? That
493      * is, if UCA has ... x <<< X << x' <<< X' < y ..., and the tailoring has &
494      * x < z... ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
495      * 
496      * It is possible that this part should be done even while constructing the
497      * list. The problem is that it is unknown what is going to be the strongest
498      * weight. So we might as well do it here. o Allocate CEs for each token in
499      * the list, based on the total number N of the largest level difference,
500      * and the gap G between baseCE and nextCE at that level. The relation *
501      * between the last item and nextCE is the same as the strongest strength. o
502      * Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) ? There are 3
503      * primary items: a, d, e. Fit them into the primary gap. Then fit b and c
504      * into the secondary gap between a and d, then fit q into the tertiary gap
505      * between b and c. o Example: baseCE << b <<< q << c * nextCE(X,2) ? There
506      * are 2 secondary items: b, c. Fit them into the secondary gap. Then fit q
507      * into the tertiary gap between b and c. o When incrementing primary
508      * values, we will not cross high byte boundaries except where there is only
509      * a single-byte primary. That is to ensure that the script reordering will
510      * continue to work.
511      * 
512      * @param collator
513      *            the rule based collator to update
514      * @exception Exception
515      *                thrown when internal program error occurs
516      */
517     void assembleTailoringTable(RuleBasedCollator collator) throws Exception {
518
519         for (int i = 0; i < m_parser_.m_resultLength_; i++) {
520             // now we need to generate the CEs
521             // We stuff the initial value in the buffers, and increase the
522             // appropriate buffer according to strength
523             if (m_parser_.m_listHeader_[i].m_first_ != null) {
524                 // if there are any elements
525                 // due to the way parser works, subsequent tailorings
526                 // may remove all the elements from a sequence, therefore
527                 // leaving an empty tailoring sequence.
528                 initBuffers(m_parser_.m_listHeader_[i]);
529             }
530         }
531
532         if (m_parser_.m_variableTop_ != null) {
533             // stuff the variable top value
534             m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
535             // remove it from the list
536             if (m_parser_.m_variableTop_.m_listHeader_.m_first_
537                     == m_parser_.m_variableTop_) {
538                 // first in list
539                 m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;
540             }
541             if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {
542                 // last in list
543                 m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;
544             }
545             if (m_parser_.m_variableTop_.m_next_ != null) {
546                 m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;
547             }
548             if (m_parser_.m_variableTop_.m_previous_ != null) {
549                 m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;
550             }
551         }
552
553         BuildTable t = new BuildTable(m_parser_);
554
555         // After this, we have assigned CE values to all regular CEs. Now we
556         // will go through the list once more and resolve expansions, make
557         // UCAElements structs and add them to the table.
558         for (int i = 0; i < m_parser_.m_resultLength_; i++) {
559             // now we need to generate the CEs
560             // We stuff the initial value in the buffers, and increase the
561             // appropriate buffer according to strength
562             createElements(t, m_parser_.m_listHeader_[i]);
563         }
564
565         m_utilElement_.clear();
566
567         // add latin-1 stuff
568         copyRangeFromUCA(t, 0, 0xFF);
569
570         // add stuff for copying
571         if (m_parser_.m_copySet_ != null) {
572             int i = 0;
573             for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
574                 copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i),
575                         m_parser_.m_copySet_.getRangeEnd(i));
576             }
577         }
578
579         // copy contractions from the UCA - this is needed mostly for Cyrillic
580         char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
581         int maxUCAContractionLength = RuleBasedCollator.MAX_UCA_CONTRACTION_LENGTH;
582         int offset = 0;
583         while (conts[offset] != 0) {
584             // A contraction is NUL-terminated and NUL-padded
585             // except if it has the maximum length.
586             int contractionLength = maxUCAContractionLength;
587             while (contractionLength > 0 && conts[offset + contractionLength - 1] == 0) {
588                 --contractionLength;
589             }
590             int first = Character.codePointAt(conts, offset);
591             int firstLength = Character.charCount(first);
592             int tailoredCE = t.m_mapping_.getValue(first);
593             Elements prefixElm = null;
594             if (tailoredCE != CE_NOT_FOUND_) {
595                 boolean needToAdd = true;
596                 if (isContractionTableElement(tailoredCE)) {
597                     if (isTailored(t.m_contractions_, tailoredCE, conts,
598                             offset + firstLength) == true) {
599                         needToAdd = false;
600                     }
601                 }
602                 if (!needToAdd && isPrefix(tailoredCE)
603                         && conts[offset + 1] == 0) {
604                     // pre-context character in UCA
605                     // The format for pre-context character is
606                     // conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP
607                     Elements elm = new Elements();
608                     elm.m_CELength_ = 0;
609                     elm.m_uchars_ = Character.toString(conts[offset]);
610                     elm.m_cPoints_ = m_utilElement_.m_uchars_;
611                     elm.m_prefixChars_ = Character.toString(conts[offset + 2]);
612                     elm.m_prefix_ = 0; // TODO(claireho) : confirm!
613                     prefixElm = t.m_prefixLookup_.get(elm);
614                     if ((prefixElm == null)
615                             || (prefixElm.m_prefixChars_.charAt(0) != conts[offset + 2])) {
616                         needToAdd = true;
617                     }
618                 }
619                 if (m_parser_.m_removeSet_ != null
620                         && m_parser_.m_removeSet_.contains(first)) {
621                     needToAdd = false;
622                 }
623
624                 if (needToAdd == true) {
625                     // we need to add if this contraction is not tailored.
626                     if (conts[offset + 1] != 0) { // not precontext
627                         m_utilElement_.m_prefix_ = 0;
628                         m_utilElement_.m_prefixChars_ = null;
629                         m_utilElement_.m_uchars_ = new String(conts, offset, contractionLength);
630                         m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
631                         m_utilElement_.m_CELength_ = 0;
632                         m_utilColEIter_.setText(m_utilElement_.m_uchars_);
633                     } else { // add a pre-context element
634                         int preKeyLen = 0;
635                         m_utilElement_.m_uchars_ = Character.toString(conts[offset]);
636                         m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
637                         m_utilElement_.m_CELength_ = 0;
638                         m_utilElement_.m_prefixChars_ = Character.toString(conts[offset + 2]);
639                         if (prefixElm == null) {
640                             m_utilElement_.m_prefix_ = 0;
641                         } else { // TODO (claireho): confirm!
642                             m_utilElement_.m_prefix_ = prefixElm.m_prefix_;
643                             // use the prefix value from the previously stored element
644                         }
645                         m_utilColEIter_.setText(m_utilElement_.m_prefixChars_);
646                         while (m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
647                             // count number of keys for pre-context char.
648                             preKeyLen++;
649                         }
650                         m_utilColEIter_.setText(m_utilElement_.m_prefixChars_ + m_utilElement_.m_uchars_);
651                         // Skip the keys for prefix character, then copy the
652                         // rest to el.
653                         while ((preKeyLen-- > 0)
654                                 && m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
655                             continue;
656                         }
657
658                     }
659                     while (true) {
660                         int CE = m_utilColEIter_.next();
661                         if (CE != CollationElementIterator.NULLORDER) {
662                             m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
663                         } else {
664                             break;
665                         }
666                     }
667                     addAnElement(t, m_utilElement_);
668                 }
669             } else if (m_parser_.m_removeSet_ != null
670                     && m_parser_.m_removeSet_.contains(first)) {
671                 copyRangeFromUCA(t, first, first);
672             }
673
674             offset += maxUCAContractionLength;
675         }
676
677         // Add completely ignorable elements
678         processUCACompleteIgnorables(t);
679
680         // canonical closure
681         canonicalClosure(t);
682
683         // still need to produce compatibility closure
684         assembleTable(t, collator);
685     }
686
687     // private inner classes -------------------------------------------------
688
689     @SuppressWarnings("unused")
690     private static class CEGenerator {
691         // package private data members --------------------------------------
692
693         WeightRange m_ranges_[];
694         int m_rangesLength_;
695         int m_byteSize_;
696         int m_start_;
697         int m_limit_;
698         int m_maxCount_;
699         int m_count_;
700         int m_current_;
701         int m_fLow_; // forbidden Low
702         int m_fHigh_; // forbidden High
703
704         // package private constructor ---------------------------------------
705
706         CEGenerator() {
707             m_ranges_ = new WeightRange[7];
708             for (int i = 6; i >= 0; i--) {
709                 m_ranges_[i] = new WeightRange();
710             }
711         }
712     }
713
714     private static class WeightRange implements Comparable<WeightRange> {
715         // public methods ----------------------------------------------------
716
717         /**
718          * Compares this object with target
719          * 
720          * @param target object to compare with
721          * @return 0 if equals, 1 if this is > target, -1 otherwise
722          */
723         public int compareTo(WeightRange target) {
724             return Utility.compareUnsigned(m_start_, target.m_start_);
725         }
726
727         /**
728          * Initialize
729          */
730         public void clear() {
731             m_start_ = 0;
732             m_end_ = 0;
733             m_length_ = 0;
734             m_count_ = 0;
735             m_length2_ = 0;
736             m_count2_ = 0;
737         }
738
739         // package private data members --------------------------------------
740
741         int m_start_;
742         int m_end_;
743         int m_length_;
744         int m_count_;
745         int m_length2_;
746         int m_count2_;
747
748         // package private constructor ---------------------------------------
749
750         WeightRange() {
751             clear();
752         }
753
754         /**
755          * Copy constructor. Cloneable is troublesome since it requires checking
756          * for exceptions.
757          * 
758          * @param source
759          *            to clone
760          */
761         WeightRange(WeightRange source) {
762             m_start_ = source.m_start_;
763             m_end_ = source.m_end_;
764             m_length_ = source.m_length_;
765             m_count_ = source.m_count_;
766             m_length2_ = source.m_length2_;
767             m_count2_ = source.m_count2_;
768         }
769     }
770
771     private static class MaxJamoExpansionTable {
772         // package private data members --------------------------------------
773
774         List<Integer> m_endExpansionCE_;
775         // vector of booleans
776         List<Boolean> m_isV_;
777         byte m_maxLSize_;
778         byte m_maxVSize_;
779         byte m_maxTSize_;
780
781         // package private constructor ---------------------------------------
782
783         MaxJamoExpansionTable() {
784             m_endExpansionCE_ = new ArrayList<Integer>();
785             m_isV_ = new ArrayList<Boolean>();
786             m_endExpansionCE_.add(Integer.valueOf(0));
787             m_isV_.add(Boolean.FALSE);
788             m_maxLSize_ = 1;
789             m_maxVSize_ = 1;
790             m_maxTSize_ = 1;
791         }
792
793         MaxJamoExpansionTable(MaxJamoExpansionTable table) {
794             m_endExpansionCE_ = new ArrayList<Integer>(table.m_endExpansionCE_);
795             m_isV_ = new ArrayList<Boolean>(table.m_isV_);
796             m_maxLSize_ = table.m_maxLSize_;
797             m_maxVSize_ = table.m_maxVSize_;
798             m_maxTSize_ = table.m_maxTSize_;
799         }
800     }
801
802     private static class MaxExpansionTable {
803         // package private constructor --------------------------------------
804
805         MaxExpansionTable() {
806             m_endExpansionCE_ = new ArrayList<Integer>();
807             m_expansionCESize_ = new ArrayList<Byte>();
808             m_endExpansionCE_.add(Integer.valueOf(0));
809             m_expansionCESize_.add(Byte.valueOf((byte) 0));
810         }
811
812         MaxExpansionTable(MaxExpansionTable table) {
813             m_endExpansionCE_ = new ArrayList<Integer>(table.m_endExpansionCE_);
814             m_expansionCESize_ = new ArrayList<Byte>(table.m_expansionCESize_);
815         }
816
817         // package private data member --------------------------------------
818
819         List<Integer> m_endExpansionCE_;
820         List<Byte> m_expansionCESize_;
821     }
822
823     private static class BasicContractionTable {
824         // package private constructors -------------------------------------
825
826         BasicContractionTable() {
827             m_CEs_ = new ArrayList<Integer>();
828             m_codePoints_ = new StringBuilder();
829         }
830
831         // package private data members -------------------------------------
832
833         StringBuilder m_codePoints_;
834         List<Integer> m_CEs_;
835     }
836
837     private static class ContractionTable {
838         // package private constructor --------------------------------------
839
840         /**
841          * Builds a contraction table
842          * 
843          * @param mapping
844          */
845         ContractionTable(IntTrieBuilder mapping) {
846             m_mapping_ = mapping;
847             m_elements_ = new ArrayList<BasicContractionTable>();
848             m_CEs_ = new ArrayList<Integer>();
849             m_codePoints_ = new StringBuilder();
850             m_offsets_ = new ArrayList<Integer>();
851             m_currentTag_ = CE_NOT_FOUND_TAG_;
852         }
853
854         /**
855          * Copies a contraction table. Not all data members are copied into
856          * their own objects.
857          * 
858          * @param table
859          */
860         ContractionTable(ContractionTable table) {
861             m_mapping_ = table.m_mapping_;
862             m_elements_ = new ArrayList<BasicContractionTable>(table.m_elements_);
863             m_codePoints_ = new StringBuilder(table.m_codePoints_);
864             m_CEs_ = new ArrayList<Integer>(table.m_CEs_);
865             m_offsets_ = new ArrayList<Integer>(table.m_offsets_);
866             m_currentTag_ = table.m_currentTag_;
867         }
868
869         // package private data members ------------------------------------
870
871         /**
872          * Vector of BasicContractionTable
873          */
874         List<BasicContractionTable> m_elements_;
875         IntTrieBuilder m_mapping_;
876         StringBuilder m_codePoints_;
877         List<Integer> m_CEs_;
878         List<Integer> m_offsets_;
879         int m_currentTag_;
880     }
881
882     /**
883      * Private class for combining mark table. The table is indexed by the class
884      * value (0-255).
885      */
886     @SuppressWarnings("unused")
887     private static class CombinClassTable {
888         /**
889          * accumulated numbers of combining marks.
890          */
891         int[] index = new int[256];
892
893         /**
894          * code point array for combining marks.
895          */
896         char[] cPoints;
897
898         /**
899          * size of cPoints.
900          */
901         int size;
902
903         // constructor
904         CombinClassTable() {
905             cPoints = null;
906             size = 0;
907             pos = 0;
908             curClass = 1;
909         }
910
911         /**
912          * Copy the combining mark table from ccc and index in compact way.
913          * 
914          * @param cps
915          *            code point array
916          * @param numOfCM
917          *            number of combining marks
918          * @param ccIndex
919          *            counts of combining marks per combining class (0-255)
920          */
921         void generate(char[] cps, int numOfCM, int[] ccIndex) {
922             int count = 0;
923
924             cPoints = new char[numOfCM];
925             for (int i = 0; i < 256; i++) {
926                 for (int j = 0; j < ccIndex[i]; j++) {
927                     cPoints[count++] = cps[(i << 8) + j];
928                 }
929                 index[i] = count;
930             }
931             size = count;
932         }
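        // Small worked example for generate(), with hypothetical input: if
        // ccIndex[1] == 2 and ccIndex[2] == 1 (all other classes empty), the two
        // class-1 marks are copied first, then the single class-2 mark, so
        // cPoints holds 3 chars, index[1] == 2, index[2] == 3, and every later
        // index[i] stays 3. GetFirstCM(2) then returns cPoints[index[1]], the
        // first (and only) class-2 mark.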
933
934         /**
935          * Get first CM(combining mark) with the combining class value cClass.
936          * 
937          * @param cClass
938          *            : combining class value.
939          * @return combining mark code point or 0 if no combining mark with class
940          *         value cClass
941          */
942         char GetFirstCM(int cClass) {
943             curClass = cClass;
944             if (cPoints == null || cClass == 0
945                     || index[cClass] == index[cClass - 1]) {
946                 return 0;
947             }
948             pos = 1;
949             return cPoints[index[cClass - 1]];
950         }
951
952         /**
953          * Get next CM(combining mark) with the combining class value cClass.
954          * Return combining mark codepoint or 0 if no next CM.
955          */
956         char GetNextCM() {
957             if (cPoints == null
958                     || index[curClass] == (index[curClass - 1] + pos)) {
959                 return 0;
960             }
961             return cPoints[index[curClass - 1] + (pos++)];
962         }
963
964         // private data members
965         int pos;
966         int curClass;
967     }
968
969     private static final class BuildTable implements TrieBuilder.DataManipulate {
970         // package private methods ------------------------------------------
971
972         /**
973          * For construction of the Trie tables. Has to be labeled public
974          * 
975          * @param cp The value of the code point.
976          * @param offset The value of the offset.
977          * @return data offset or 0
978          */
979         public int getFoldedValue(int cp, int offset) {
980             int limit = cp + 0x400;
981             while (cp < limit) {
982                 int value = m_mapping_.getValue(cp);
983                 boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
984                 int tag = getCETag(value);
985                 if (inBlockZero == true) {
986                     cp += TrieBuilder.DATA_BLOCK_LENGTH;
987                 } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {
988                     // These are values that are starting in either UCA
989                     // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
990                     // Presence of these tags means that there is nothing in
991                     // this position and that it should be skipped.
992                     return RuleBasedCollator.CE_SPECIAL_FLAG_
993                             | (CE_SURROGATE_TAG_ << 24) | offset;
994                 } else {
995                     ++cp;
996                 }
997             }
998             return 0;
999         }
1000
1001         // package private constructor --------------------------------------
1002
1003         /**
1004          * Builds a BuildTable from the given rule parser
1005          */
1006         BuildTable(CollationRuleParser parser) {
1007             m_collator_ = new RuleBasedCollator();
1008             m_collator_.setWithUCAData();
1009             MaxExpansionTable maxet = new MaxExpansionTable();
1010             MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
1011             m_options_ = parser.m_options_;
1012             m_expansions_ = new ArrayList<Integer>();
1013             // Do your own mallocs for the structure, array and have linear
1014             // Latin 1
1015             int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
1016                     | (CE_NOT_FOUND_TAG_ << 24);
1017             // temporary fix for jb3822, 0x100000 -> 30000
1018             m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue,
1019                     trieinitialvalue, true);
1020             m_prefixLookup_ = new HashMap<Elements, Elements>();
1021             // uhash_open(prefixLookupHash, prefixLookupComp);
1022             m_contractions_ = new ContractionTable(m_mapping_);
1023             // copy UCA's maxexpansion and merge as we go along
1024             m_maxExpansions_ = maxet;
1025             // adding an extra initial value for easier manipulation
1026             for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {
1027                 maxet.m_endExpansionCE_.add(Integer.valueOf(
1028                         RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
1029                 maxet.m_expansionCESize_.add(Byte.valueOf(
1030                         RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
1031             }
1032             m_maxJamoExpansions_ = maxjet;
1033
1034             m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
1035             m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
1036             Arrays.fill(m_unsafeCP_, (byte) 0);
1037             Arrays.fill(m_contrEndCP_, (byte) 0);
1038         }
1039
1040         /**
1041          * Duplicating a BuildTable. Not all data members are duplicated into
1042          * their own objects.
1043          * 
1044          * @param table
1045          *            to clone
1046          */
1047         BuildTable(BuildTable table) {
1048             m_collator_ = table.m_collator_;
1049             m_mapping_ = new IntTrieBuilder(table.m_mapping_);
1050             m_expansions_ = new ArrayList<Integer>(table.m_expansions_);
1051             m_contractions_ = new ContractionTable(table.m_contractions_);
1052             m_contractions_.m_mapping_ = m_mapping_;
1053             m_options_ = table.m_options_;
1054             m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_);
1055             m_maxJamoExpansions_ = new MaxJamoExpansionTable(
1056                     table.m_maxJamoExpansions_);
1057             m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
1058             System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
1059                     m_unsafeCP_.length);
1060             m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
1061             System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
1062                     m_contrEndCP_.length);
1063         }
1064
1065         // package private data members -------------------------------------
1066
1067         RuleBasedCollator m_collator_;
1068         IntTrieBuilder m_mapping_;
1069         List<Integer> m_expansions_;
1070         ContractionTable m_contractions_;
1071         // UCATableHeader image;
1072         CollationRuleParser.OptionSet m_options_;
1073         MaxExpansionTable m_maxExpansions_;
1074         MaxJamoExpansionTable m_maxJamoExpansions_;
1075         byte m_unsafeCP_[];
1076         byte m_contrEndCP_[];
1077         Map<Elements, Elements> m_prefixLookup_;
1078         CombinClassTable cmLookup = null;
1079     }
1080
1081     private static class Elements {
1082         // package private data members -------------------------------------
1083
1084         String m_prefixChars_;
1085         int m_prefix_;
1086         String m_uchars_;
1087         /**
1088          * Working string
1089          */
1090         String m_cPoints_;
1091         /**
1092          * Offset to the working string
1093          */
1094         int m_cPointsOffset_;
1095         /**
1096          * These are collation elements - there could be more than one - in case
1097          * of expansion
1098          */
1099         int m_CEs_[];
1100         int m_CELength_;
1101         /**
1102          * This is the value the element maps to in the original table
1103          */
1104         int m_mapCE_;
1105         int m_sizePrim_[];
1106         int m_sizeSec_[];
1107         int m_sizeTer_[];
1108         boolean m_variableTop_;
1109         boolean m_caseBit_;
1110
1111         // package private constructors -------------------------------------
1112
1113         /**
1114          * Package private constructor
1115          */
1116         Elements() {
1117             m_sizePrim_ = new int[128];
1118             m_sizeSec_ = new int[128];
1119             m_sizeTer_ = new int[128];
1120             m_CEs_ = new int[256];
1121             m_CELength_ = 0;
1122         }
1123
1124         /**
1125          * Package private constructor
1126          */
1127         Elements(Elements element) {
1128             m_prefixChars_ = element.m_prefixChars_;
1129             m_prefix_ = element.m_prefix_;
1130             m_uchars_ = element.m_uchars_;
1131             m_cPoints_ = element.m_cPoints_;
1132             m_cPointsOffset_ = element.m_cPointsOffset_;
1133             m_CEs_ = element.m_CEs_;
1134             m_CELength_ = element.m_CELength_;
1135             m_mapCE_ = element.m_mapCE_;
1136             m_sizePrim_ = element.m_sizePrim_;
1137             m_sizeSec_ = element.m_sizeSec_;
1138             m_sizeTer_ = element.m_sizeTer_;
1139             m_variableTop_ = element.m_variableTop_;
1140             m_caseBit_ = element.m_caseBit_;
1141         }
1142
1143         // package private methods -------------------------------------------
1144
1145         /**
1146          * Initializing the elements
1147          */
1148         public void clear() {
1149             m_prefixChars_ = null;
1150             m_prefix_ = 0;
1151             m_uchars_ = null;
1152             m_cPoints_ = null;
1153             m_cPointsOffset_ = 0;
1154             m_CELength_ = 0;
1155             m_mapCE_ = 0;
1156             Arrays.fill(m_sizePrim_, 0);
1157             Arrays.fill(m_sizeSec_, 0);
1158             Arrays.fill(m_sizeTer_, 0);
1159             m_variableTop_ = false;
1160             m_caseBit_ = false;
1161         }
1162
1163         /**
1164          * Hashcode calculation for the element
1165          * 
1166          * @return the hashcode
1167          */
1168         public int hashCode() {
1169             String str = m_cPoints_.substring(m_cPointsOffset_);
1170             return str.hashCode();
1171         }
1172
1173         /**
1174          * Equals calculation
1175          * 
1176          * @param target Object to compare
1177          * @return true if target is the same as this object
1178          */
1179         public boolean equals(Object target) {
1180             if (target == this) {
1181                 return true;
1182             }
1183             if (target instanceof Elements) {
1184                 Elements t = (Elements) target;
1185                 int size = m_cPoints_.length() - m_cPointsOffset_;
1186                 if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
1187                     return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,
1188                             m_cPoints_, m_cPointsOffset_, size);
1189                 }
1190             }
1191             return false;
1192         }
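        // Note on hashCode()/equals() above: both look only at the working string
        // from m_cPointsOffset_ onwards, which lets an Elements instance serve as
        // a HashMap key (see m_prefixLookup_ in BuildTable) for finding a stored
        // element with the same working string regardless of its other fields.
        // For example, an element with m_cPoints_ = "ab" and offset 0 compares
        // equal to one with m_cPoints_ = "xab" and offset 1.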
1193     }
1194
1195     // private data member ---------------------------------------------------
1196
1197     /**
1198      * Maximum strength used in CE building
1199      */
1200     private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
1201     /**
1202      * Maximum collation strength
1203      */
1204     private static final int CE_STRENGTH_LIMIT_ = 16;
1205     /**
1206      * Strength mask array, used in inverse UCA
1207      */
1208     private static final int STRENGTH_MASK_[] = { 0xFFFF0000, 0xFFFFFF00,
1209         0xFFFFFFFF };
1210     /**
1211      * Special CE value for not found
1212      */
1213     private static final int CE_NOT_FOUND_ = 0xF0000000;
1214     /**
1215      * CE tag for not found
1216      */
1217     private static final int CE_NOT_FOUND_TAG_ = 0;
1218     /**
1219      * This code point results in an expansion
1220      */
1221     private static final int CE_EXPANSION_TAG_ = 1;
1222     /**
1223      * Start of a contraction
1224      */
1225     private static final int CE_CONTRACTION_TAG_ = 2;
1226     /*
1227      * Thai character - do the reordering
1228      */
1229     // private static final int CE_THAI_TAG_ = 3;
1230     /*
1231      * Charset processing, not yet implemented
1232      */
1233     // private static final int CE_CHARSET_TAG_ = 4;
1234     /**
1235      * Lead surrogate that is tailored and doesn't start a contraction
1236      */
1237     private static final int CE_SURROGATE_TAG_ = 5;
1238     /*
1239      * AC00-D7AF
1240      */
1241     // private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
1242     /*
1243      * D800-DBFF
1244      */
1245     // private static final int CE_LEAD_SURROGATE_TAG_ = 7;
1246     /*
1247      * DC00-DFFF
1248      */
1249     // private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
1250     /*
1251      * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
1252      */
1253     // private static final int CE_CJK_IMPLICIT_TAG_ = 9;
1254     private static final int CE_IMPLICIT_TAG_ = 10;
1255     private static final int CE_SPEC_PROC_TAG_ = 11;
1256     /**
1257      * This is a three byte primary with starting secondaries and tertiaries. It
1258      * fits in a single 32 bit CE and is used instead of expansion to save space
1259      * without affecting the performance (hopefully)
1260      */
1261     private static final int CE_LONG_PRIMARY_TAG_ = 12;
1262     /**
1263      * Unsafe UChar hash table size. Size is 32 bytes for 1 bit for each
1264      * latin 1 char + some power of two for hashing the rest of the chars. Size
1265      * in bytes
1266      */
1267     private static final int UNSAFECP_TABLE_SIZE_ = 1056;
1268     /**
1269      * Mask value down to "some power of two" -1. Number of bits, not num of
1270      * bytes.
1271      */
1272     private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
1273     /**
1274      * Case values
1275      */
1276     private static final int UPPER_CASE_ = 0x80;
1277     private static final int MIXED_CASE_ = 0x40;
1278     private static final int LOWER_CASE_ = 0x00;
1279     /*
1280      * Initial table size
1281      */
1282     // private static final int INIT_TABLE_SIZE_ = 1028;
1283     /*
1284      * Header size, copied from ICU4C, to be changed when that value changes
1285      */
1286     // private static final int HEADER_SIZE_ = 0xC4;
1287     /**
1288      * Contraction table new element indicator
1289      */
1290     private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
1291     /**
1292      * Parser for the rules
1293      */
1294     private CollationRuleParser m_parser_;
1295     /**
1296      * Utility UCA collation element iterator
1297      */
1298     private CollationElementIterator m_utilColEIter_;
1299     /**
1300      * Utility data members
1301      */
1302     private CEGenerator m_utilGens_[] = { new CEGenerator(), new CEGenerator(),
1303         new CEGenerator() };
1304     private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
1305     private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
1306     private Elements m_utilElement_ = new Elements();
1307     private Elements m_utilElement2_ = new Elements();
1308     private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
1309     private int m_utilCountBuffer_[] = new int[6];
1310     private long m_utilLongBuffer_[] = new long[5];
1311     private WeightRange m_utilLowerWeightRange_[] = { new WeightRange(),
1312         new WeightRange(), new WeightRange(), new WeightRange(),
1313         new WeightRange() };
1314     private WeightRange m_utilUpperWeightRange_[] = { new WeightRange(),
1315         new WeightRange(), new WeightRange(), new WeightRange(),
1316         new WeightRange() };
1317     private WeightRange m_utilWeightRange_ = new WeightRange();
1318     private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
1319     private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
1320     private StringBuilder m_utilStringBuffer_ = new StringBuilder("");
1321     // Flag indicating whether a combining marks table is required.
1322     private static boolean buildCMTabFlag = false;
1323
1324     // private methods -------------------------------------------------------
1325
1326     /**
1327      * @param listheader
1328      *            parsed rule tokens
1329      * @exception Exception
1330      *                thrown when internal error occurs
1331      */
1332     private void initBuffers(CollationRuleParser.TokenListHeader listheader)
1333             throws Exception {
1334         CollationRuleParser.Token token = listheader.m_last_;
1335         Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);
1336
1337         token.m_toInsert_ = 1;
1338         m_utilIntBuffer_[token.m_strength_] = 1;
1339         while (token.m_previous_ != null) {
1340             if (token.m_previous_.m_strength_ < token.m_strength_) {
1341                 // going up
1342                 m_utilIntBuffer_[token.m_strength_] = 0;
1343                 m_utilIntBuffer_[token.m_previous_.m_strength_]++;
1344             } else if (token.m_previous_.m_strength_ > token.m_strength_) {
1345                 // going down
1346                 m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
1347             } else {
1348                 m_utilIntBuffer_[token.m_strength_]++;
1349             }
1350             token = token.m_previous_;
1351             token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1352         }
1353
1354         token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1355         INVERSE_UCA_.getInverseGapPositions(listheader);
1356
1357         token = listheader.m_first_;
1358         int fstrength = Collator.IDENTICAL;
1359         int initstrength = Collator.IDENTICAL;
1360
1361         m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,
1362                 listheader.m_baseContCE_, Collator.PRIMARY);
1363         m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,
1364                 listheader.m_baseContCE_, Collator.SECONDARY);
1365         m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,
1366                 listheader.m_baseContCE_, Collator.TERTIARY);
1367         while (token != null) {
1368             fstrength = token.m_strength_;
1369             if (fstrength < initstrength) {
1370                 initstrength = fstrength;
1371                 if (listheader.m_pos_[fstrength] == -1) {
1372                     while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) {
1373                         fstrength--;
1374                     }
1375                     if (listheader.m_pos_[fstrength] == -1) {
1376                         throw new Exception("Internal program error");
1377                     }
1378                 }
1379                 if (initstrength == Collator.TERTIARY) {
1380                     // starting with tertiary
1381                     m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1382                     m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
1383                     m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
1384                             m_utilGens_[Collator.TERTIARY],
1385                             listheader.m_gapsLo_, listheader.m_gapsHi_, token,
1386                             fstrength);
1387                 } else if (initstrength == Collator.SECONDARY) {
1388                     // secondaries
1389                     m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1390                     m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
1391                             m_utilGens_[Collator.SECONDARY],
1392                             listheader.m_gapsLo_, listheader.m_gapsHi_, token,
1393                             fstrength);
1394                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1395                             m_utilGens_[Collator.TERTIARY], token,
1396                             Collator.TERTIARY);
1397                 } else {
1398                     // primaries
1399                     m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
1400                             m_utilGens_[Collator.PRIMARY],
1401                             listheader.m_gapsLo_, listheader.m_gapsHi_, token,
1402                             fstrength);
1403                     m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1404                             m_utilGens_[Collator.SECONDARY], token,
1405                             Collator.SECONDARY);
1406                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1407                             m_utilGens_[Collator.TERTIARY], token,
1408                             Collator.TERTIARY);
1409                 }
1410             } else {
1411                 if (token.m_strength_ == Collator.TERTIARY) {
1412                     m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
1413                 } else if (token.m_strength_ == Collator.SECONDARY) {
1414                     m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
1415                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1416                             m_utilGens_[Collator.TERTIARY], token,
1417                             Collator.TERTIARY);
1418                 } else if (token.m_strength_ == Collator.PRIMARY) {
1419                     m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
1420                     m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1421                             m_utilGens_[Collator.SECONDARY], token,
1422                             Collator.SECONDARY);
1423                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1424                             m_utilGens_[Collator.TERTIARY], token,
1425                             Collator.TERTIARY);
1426                 }
1427             }
1428             doCE(m_utilCEBuffer_, token);
1429             token = token.m_next_;
1430         }
1431     }
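    // Illustration of the backward pass above (hypothetical token chain,
    // assuming Collator.PRIMARY == 0 and Collator.SECONDARY == 1): for tokens
    // b, c at primary strength followed by d at secondary strength, walking
    // back from d gives d.m_toInsert_ == 1; the "going up" step at c resets
    // the secondary count and bumps the primary count, so c.m_toInsert_ == 1;
    // the equal-strength step at b bumps it again, so b.m_toInsert_ == 2.
    // m_toInsert_ is later used as the number of weights to allocate in a gap.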
1432
1433     /**
1434      * Get the next generated ce
1435      * 
1436      * @param g
1437      *            ce generator
1438      * @return next generated ce
1439      */
1440     private int getNextGenerated(CEGenerator g) {
1441         g.m_current_ = nextWeight(g);
1442         return g.m_current_;
1443     }
1444
1445     /**
1446      * @param g
1447      *            CEGenerator
1448      * @param token
1449      *            rule token
1450      * @param strength
1451      * @return the generated ce
1452      * @exception Exception
1453      *                thrown when internal error occurs
1454      */
1455     private int getSimpleCEGenerator(CEGenerator g,
1456             CollationRuleParser.Token token, int strength) throws Exception {
1457         int high, low, count = 1;
1458         int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
1459
1460         if (strength == Collator.SECONDARY) {
1461             low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1462             high = 0xFFFFFFFF;
1463             count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
1464         } else {
1465             low = RuleBasedCollator.BYTE_COMMON_ << 24; // 0x05000000;
1466             high = 0x40000000;
1467             count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
1468         }
1469
1470         if (token.m_next_ != null && token.m_next_.m_strength_ == strength) {
1471             count = token.m_next_.m_toInsert_;
1472         }
1473
1474         g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1475                 g.m_ranges_);
1476         g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
1477
1478         if (g.m_rangesLength_ == 0) {
1479             throw new Exception("Internal program error");
1480         }
1481         return g.m_current_;
1482     }
1483
1484     /**
1485      * Combines 2 ces into one with respect to the argument strength
1486      * 
1487      * @param ce1
1488      *            first ce
1489      * @param ce2
1490      *            second ce
1491      * @param strength
1492      *            strength to use
1493      * @return combined ce
1494      */
1495     private static int mergeCE(int ce1, int ce2, int strength) {
1496         int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
1497         if (strength == Collator.SECONDARY) {
1498             mask = RuleBasedCollator.CE_SECONDARY_MASK_;
1499         } else if (strength == Collator.PRIMARY) {
1500             mask = RuleBasedCollator.CE_PRIMARY_MASK_;
1501         }
1502         ce1 &= mask;
1503         ce2 &= mask;
1504         switch (strength) {
1505         case Collator.PRIMARY:
1506             return ce1 | ce2 >>> 16;
1507         case Collator.SECONDARY:
1508             return ce1 << 16 | ce2 << 8;
1509         default:
1510             return ce1 << 24 | ce2 << 16;
1511         }
1512     }
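    // Illustration only, assuming the conventional CE layout where
    // CE_PRIMARY_MASK_ is 0xFFFF0000 (primary in the top 16 bits): at PRIMARY
    // strength the two 16-bit primaries are packed into a single int, e.g.
    //   mergeCE(0x11223344, 0x55667788, Collator.PRIMARY) == 0x11225566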
1513
1514     /**
1515      * @param g
1516      *            CEGenerator
1517      * @param lows
1518      *            low gap array
1519      * @param highs
1520      *            high gap array
1521      * @param token
1522      *            rule token
1523      * @param fstrength
1524      * @exception Exception
1525      *                thrown when internal error occurs
1526      */
1527     private int getCEGenerator(CEGenerator g, int lows[], int highs[],
1528             CollationRuleParser.Token token, int fstrength) throws Exception {
1529         int strength = token.m_strength_;
1530         int low = lows[fstrength * 3 + strength];
1531         int high = highs[fstrength * 3 + strength];
1532         int maxbyte = 0;
1533         if (strength == Collator.TERTIARY) {
1534             maxbyte = 0x3F;
1535         } else if (strength == Collator.PRIMARY) {
1536             maxbyte = 0xFE;
1537         } else {
1538             maxbyte = 0xFF;
1539         }
1540
1541         int count = token.m_toInsert_;
1542
1543         if (Utility.compareUnsigned(low, high) >= 0
1544                 && strength > Collator.PRIMARY) {
1545             int s = strength;
1546             while (true) {
1547                 s--;
1548                 if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
1549                     if (strength == Collator.SECONDARY) {
1550                         if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) {
1551                             // Override if low range is less than
1552                             // UCOL_COMMON_TOP2.
1553                             low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1554                         }
1555                         high = 0xFFFFFFFF;
1556                     } else {
1557                         if (low < RuleBasedCollator.COMMON_BOTTOM_3 << 24) {
1558                             // Override if low range is less than
1559                             // UCOL_COMMON_BOT3.
1560                             low = RuleBasedCollator.COMMON_BOTTOM_3 << 24;
1561                         }
1562                         high = 0x40000000;
1563                     }
1564                     break;
1565                 }
1566                 if (s < 0) {
1567                     throw new Exception("Internal program error");
1568                 }
1569             }
1570         }
1571         if(0 <= low && low < 0x02000000) {  // unsigned comparison < 0x02000000
1572             // We must not use CE weight byte 02, so we set it as the minimum lower bound.
1573             // See http://site.icu-project.org/design/collation/bytes
1574             low = 0x02000000;
1575         }
1576
1577         if (strength == Collator.SECONDARY) { // similar as simple
1578             if (Utility.compareUnsigned(low,
1579                     RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
1580                     && Utility.compareUnsigned(low,
1581                             RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1582                 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1583             }
1584             if (Utility.compareUnsigned(high,
1585                     RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
1586                     && Utility.compareUnsigned(high,
1587                             RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1588                 high = RuleBasedCollator.COMMON_TOP_2_ << 24;
1589             }
1590             if (Utility.compareUnsigned(low,
1591                     RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
1592                 g.m_rangesLength_ = allocateWeights(
1593                         RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24, high,
1594                         count, maxbyte, g.m_ranges_);
1595                 g.m_current_ = nextWeight(g);
1596                 // g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
1597                 return g.m_current_;
1598             }
1599         }
1600
1601         g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1602                 g.m_ranges_);
1603         if (g.m_rangesLength_ == 0) {
1604             throw new Exception("Internal program error");
1605         }
1606         g.m_current_ = nextWeight(g);
1607         return g.m_current_;
1608     }
1609
1610     /**
1611      * @param ceparts
1612      *            list of collation elements parts
1613      * @param token
1614      *            rule token
1615      * @exception Exception
1616      *                thrown when forming case bits for expansions fails
1617      */
1618     private void doCE(int ceparts[], CollationRuleParser.Token token)
1619             throws Exception {
1620         // Packs the CE parts into the token's collation elements and sets case bits
1621         // int noofbytes[] = new int[3];
1622         for (int i = 0; i < 3; i++) {
1623             // noofbytes[i] = countBytes(ceparts[i]);
1624             m_utilIntBuffer_[i] = countBytes(ceparts[i]);
1625         }
1626
1627         // Here we have to pack CEs from parts
1628         int cei = 0;
1629         int value = 0;
1630
1631         while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1]
1632                 || cei < m_utilIntBuffer_[2]) {
1633             if (cei > 0) {
1634                 value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
1635             } else {
1636                 value = 0;
1637             }
1638
1639             if ((cei << 1) < m_utilIntBuffer_[0]) {
1640                 value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;
1641             }
1642             if (cei < m_utilIntBuffer_[1]) {
1643                 value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
1644             }
1645
1646             if (cei < m_utilIntBuffer_[2]) {
1647                 value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);
1648             }
1649             token.m_CE_[cei] = value;
1650             cei++;
1651         }
1652         if (cei == 0) { // totally ignorable
1653             token.m_CELength_ = 1;
1654             token.m_CE_[0] = 0;
1655         } else { // there is at least something
1656             token.m_CELength_ = cei;
1657         }
1658
1659         // Case bits handling for expansion
1660         if (token.m_CE_[0] != 0) { // case bits should be set only for
1661                                    // non-ignorables
1662             token.m_CE_[0] &= 0xFFFFFF3F; // Clean the case bits field
1663             int cSize = (token.m_source_ & 0xFF000000) >>> 24;
1664             int startoftokenrule = token.m_source_ & 0x00FFFFFF;
1665
1666             if (cSize > 1) {
1667                 // Do it manually
1668                 String tokenstr = token.m_rules_.substring(startoftokenrule,
1669                                   startoftokenrule + cSize);
1670                 token.m_CE_[0] |= getCaseBits(tokenstr);
1671             } else {
1672                 // Copy it from the UCA
1673                 int caseCE = getFirstCE(token.m_rules_.charAt(startoftokenrule));
1674                 token.m_CE_[0] |= (caseCE & 0xC0);
1675             }
1676         }
1677     }
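    // Sketch of the packing above with hypothetical parts:
    //   ceparts = { 0x0A0B0000 /* primary */, 0x05000000 /* secondary */,
    //               0x05000000 /* tertiary */ }
    // countBytes() yields {2, 1, 1}, so the loop runs once (cei == 0) and the
    // token gets the single CE 0x0A0B0505: 16 bits of primary, the top
    // secondary byte in bits 15..8 and six tertiary bits in bits 5..0, before
    // the case bits are ORed into bits 6 and 7 of the low byte.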
1678
1679     /**
1680      * Count the number of non-zero bytes used in the ce
1681      * 
1682      * @param ce
1683      * @return number of non-zero bytes used in ce
1684      */
1685     private static final int countBytes(int ce) {
1686         int mask = 0xFFFFFFFF;
1687         int result = 0;
1688         while (mask != 0) {
1689             if ((ce & mask) != 0) {
1690                 result++;
1691             }
1692             mask >>>= 8;
1693         }
1694         return result;
1695     }
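    // Note that trailing zero bytes are not counted, but zero bytes between
    // non-zero ones are; for example countBytes(0x0A0B0505) == 4,
    // countBytes(0x05000000) == 1 and countBytes(0x0A000500) == 3.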
1696
1697     /**
1698      * We are ready to create collation elements
1699      * 
1700      * @param t
1701      *            build table to insert elements into
1702      * @param lh
1703      *            rule token list header
1704      */
1705     private void createElements(BuildTable t,
1706             CollationRuleParser.TokenListHeader lh) {
1707         CollationRuleParser.Token tok = lh.m_first_;
1708         m_utilElement_.clear();
1709         while (tok != null) {
1710             // first, check if there are any expansions
1711             // if there are expansions, we need to do a little bit more
1712             // processing since parts of expansion can be tailored, while
1713             // others are not
1714             if (tok.m_expansion_ != 0) {
1715                 int len = tok.m_expansion_ >>> 24;
1716                 int currentSequenceLen = len;
1717                 int expOffset = tok.m_expansion_ & 0x00FFFFFF;
1718                 m_utilToken_.m_source_ = currentSequenceLen | expOffset;
1719                 m_utilToken_.m_rules_ = m_parser_.m_source_;
1720
1721                 while (len > 0) {
1722                     currentSequenceLen = len;
1723                     while (currentSequenceLen > 0) {
1724                         m_utilToken_.m_source_ = (currentSequenceLen << 24)
1725                                 | expOffset;
1726                         CollationRuleParser.Token expt = m_parser_.m_hashTable_.get(m_utilToken_);
1727                         if (expt != null
1728                                 && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {
1729                             // expansion is tailored
1730                             int noOfCEsToCopy = expt.m_CELength_;
1731                             for (int j = 0; j < noOfCEsToCopy; j++) {
1732                                 tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];
1733                             }
1734                             tok.m_expCELength_ += noOfCEsToCopy;
1735                             // never try to add codepoints and CEs.
1736                             // For some odd reason, it won't work.
1737                             expOffset += currentSequenceLen; // noOfCEsToCopy;
1738                             len -= currentSequenceLen; // noOfCEsToCopy;
1739                             break;
1740                         } else {
1741                             currentSequenceLen--;
1742                         }
1743                     }
1744                     if (currentSequenceLen == 0) {
1745                         // couldn't find any tailored subsequence, will have to
1746                         // get one from UCA. first, get the UChars from the
1747                         // rules, then pick CEs out until there are no more and
1748                         // stuff them into expansion
1749                         m_utilColEIter_.setText(m_parser_.m_source_.substring(
1750                                 expOffset, expOffset + 1));
1751                         while (true) {
1752                             int order = m_utilColEIter_.next();
1753                             if (order == CollationElementIterator.NULLORDER) {
1754                                 break;
1755                             }
1756                             tok.m_expCE_[tok.m_expCELength_++] = order;
1757                         }
1758                         expOffset++;
1759                         len--;
1760                     }
1761                 }
1762             } else {
1763                 tok.m_expCELength_ = 0;
1764             }
1765
1766             // set the ucaelement with obtained values
1767             m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_;
1768
1769             // copy CEs
1770             System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
1771                     tok.m_CELength_);
1772             System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
1773                     tok.m_CELength_, tok.m_expCELength_);
1774
1775             // copy UChars
1776             // We kept the prefix and the source together, since a prefix acts
1777             // as a kind of contraction.
1778             // However, now we have to slice the prefix off the main string.
1779             m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
1780             m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
1781             if (tok.m_prefix_ != 0) {
1782                 // we will just copy the prefix here, and adjust accordingly in
1783                 // the addPrefix function in ucol_elm. The reason is that we
1784                 // need to add both composed AND decomposed elements to the
1785                 // unsafe table.
1786                 int size = tok.m_prefix_ >> 24;
1787                 int offset = tok.m_prefix_ & 0x00FFFFFF;
1788                 m_utilElement_.m_prefixChars_ = m_parser_.m_source_.substring(
1789                         offset, offset + size);
1790                 size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
1791                 offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);
1792                 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
1793                         offset, offset + size);
1794             } else {
1795                 m_utilElement_.m_prefixChars_ = null;
1796                 int offset = tok.m_source_ & 0x00FFFFFF;
1797                 int size = tok.m_source_ >>> 24;
1798                 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
1799                         offset, offset + size);
1800             }
1801             m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
1802
1803             boolean containCombinMarks = false;
1804             for (int i = 0; i < m_utilElement_.m_cPoints_.length()
1805                     - m_utilElement_.m_cPointsOffset_; i++) {
1806                 if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
1807                     t.m_collator_.m_isJamoSpecial_ = true;
1808                     break;
1809                 }
1810                 if (!buildCMTabFlag) {
1811                     // check combining class
1812                     int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(i));  // TODO: review for handling supplementary characters
1813                     if ((fcd & 0xff) == 0) {
1814                         // reset flag when current char is not combining mark.
1815                         containCombinMarks = false;
1816                     } else {
1817                         containCombinMarks = true;
1818                     }
1819                 }
1820             }
1821
1822             if (!buildCMTabFlag && containCombinMarks) {
1823                 buildCMTabFlag = true;
1824             }
1825
1826             /***
1827              * // Case bits handling m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F; //
1828              * Clean the case bits field if (m_utilElement_.m_cPoints_.length()
1829              * - m_utilElement_.m_cPointsOffset_ > 1) { // Do it manually
1830              * m_utilElement_.m_CEs_[0] |=
1831              * getCaseBits(m_utilElement_.m_cPoints_); } else { // Copy it from
1832              * the UCA int caseCE =
1833              * getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
1834              * m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0); }
1835              ***/
1836             // and then, add it
1837             addAnElement(t, m_utilElement_);
1838             tok = tok.m_next_;
1839         }
1840     }
1841
1842     /**
1843      * Tests whether the string argument has case
1844      * 
1845      * @param src
1846      *            string
1847      * @return the case bits for this string
1848      * @exception Exception
1849      *                thrown when internal program error occurs
1850      */
1851     private final int getCaseBits(String src) throws Exception {
1852         int uCount = 0;
1853         int lCount = 0;
1854         src = Normalizer.decompose(src, true);
1855         m_utilColEIter_.setText(src);
1856         for (int i = 0; i < src.length(); i++) {
1857             m_utilColEIter_.setText(src.substring(i, i + 1));
1858             int order = m_utilColEIter_.next();
1859             if (RuleBasedCollator.isContinuation(order)) {
1860                 throw new Exception("Internal program error");
1861             }
1862             if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {
1863                 uCount++;
1864             } else {
1865                 char ch = src.charAt(i);
1866                 if (UCharacter.isLowerCase(ch)) {
1867                     lCount++;
1868                 } else {
1869                     if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
1870                         lCount++;
1871                     }
1872                 }
1873             }
1874         }
1875
1876         if (uCount != 0 && lCount != 0) {
1877             return MIXED_CASE_;
1878         } else if (uCount != 0) {
1879             return UPPER_CASE_;
1880         } else {
1881             return LOWER_CASE_;
1882         }
1883     }
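    // Rough behaviour, for illustration: a tailored string such as "ab" comes
    // back as LOWER_CASE_, "AB" as UPPER_CASE_ and "Ab" as MIXED_CASE_; the
    // method decomposes the string and inspects the case bits of each
    // character's first UCA CE, counting small kana as lower case.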
1884
1885     /**
1886      * Converts a char to the corresponding large Kana
1887      * 
1888      * @param ch
1889      *            character to convert
1890      * @return the converted Kana character
1891      */
1892     private static final char toLargeKana(char ch) {
1893         if (0x3042 < ch && ch < 0x30ef) { // Kana range
1894             switch (ch - 0x3000) {
1895             case 0x41:
1896             case 0x43:
1897             case 0x45:
1898             case 0x47:
1899             case 0x49:
1900             case 0x63:
1901             case 0x83:
1902             case 0x85:
1903             case 0x8E:
1904             case 0xA1:
1905             case 0xA3:
1906             case 0xA5:
1907             case 0xA7:
1908             case 0xA9:
1909             case 0xC3:
1910             case 0xE3:
1911             case 0xE5:
1912             case 0xEE:
1913                 ch++;
1914                 break;
1915             case 0xF5:
1916                 ch = 0x30AB;
1917                 break;
1918             case 0xF6:
1919                 ch = 0x30B1;
1920                 break;
1921             }
1922         }
1923         return ch;
1924     }
1925
1926     /**
1927      * Converts a char to the corresponding small Kana
1928      * 
1929      * @param ch
1930      *            character to convert
1931      * @return the converted Kana character
1932      */
1933     private static final char toSmallKana(char ch) {
1934         if (0x3042 < ch && ch < 0x30ef) { // Kana range
1935             switch (ch - 0x3000) {
1936             case 0x42:
1937             case 0x44:
1938             case 0x46:
1939             case 0x48:
1940             case 0x4A:
1941             case 0x64:
1942             case 0x84:
1943             case 0x86:
1944             case 0x8F:
1945             case 0xA2:
1946             case 0xA4:
1947             case 0xA6:
1948             case 0xA8:
1949             case 0xAA:
1950             case 0xC4:
1951             case 0xE4:
1952             case 0xE6:
1953             case 0xEF:
1954                 ch--;
1955                 break;
1956             case 0xAB:
1957                 ch = 0x30F5;
1958                 break;
1959             case 0xB1:
1960                 ch = 0x30F6;
1961                 break;
1962             }
1963         }
1964         return ch;
1965     }
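    // Examples for the two kana helpers above: toLargeKana('\u3043') returns
    // '\u3044' and toLargeKana('\u30F5') returns '\u30AB', while
    // toSmallKana('\u3044') returns '\u3043' and toSmallKana('\u30AB') returns
    // '\u30F5'; characters outside the handled range are returned unchanged.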
1966
1967     /**
1968      * Gets the first UCA collation element for the argument char. This should be connected to special Jamo handling.
1969      */
1970     private int getFirstCE(char ch) {
1971         m_utilColEIter_.setText(UCharacter.toString(ch));
1972         return m_utilColEIter_.next();
1973     }
1974
1975     /**
1976      * Adds an element that has been read, while testing for existence
1977      * 
1978      * @param t
1979      *            build table
1980      * @param element
1981      * @return ce
1982      */
1983     private int addAnElement(BuildTable t, Elements element) {
1984         List<Integer> expansions = t.m_expansions_;
1985         element.m_mapCE_ = 0;
1986
1987         if (element.m_CELength_ == 1) {
1988             element.m_mapCE_ = element.m_CEs_[0];
1989
1990         } else {
1991             // unfortunately, it looks like we have to look for a long primary
1992             // here since in canonical closure we are going to hit some long
1993             // primaries from the first phase, and they will come back as
1994             // continuations/expansions destroying the effect of the previous
1995             // optimization. A long primary is a three byte primary with
1996             // starting secondaries and tertiaries. It can appear in long runs
1997             // of only primary differences (like East Asian tailorings). Also,
1998             // it should not be an expansion, as expansions would break with
1999             // this.
2000             if (element.m_CELength_ == 2 // a two CE expansion
2001                     && RuleBasedCollator.isContinuation(element.m_CEs_[1])
2002                     && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that
2003                                                                                                               // has
2004                                                                                                               // only
2005                                                                                                               // primaries
2006                                                                                                               // in
2007                                                                                                               // continuation
2008                     && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
2009                     // a common secondary
2010                     && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and
2011                                                                                       // a
2012                                                                                       // common
2013                                                                                       // tertiary
2014             ) {
2015                 element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
2016                 // a long primary special
2017                         | (CE_LONG_PRIMARY_TAG_ << 24)
2018                         // first and second byte of primary
2019                         | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
2020                         // third byte of primary
2021                         | ((element.m_CEs_[1] >> 24) & 0xFF);
2022             } else {
2023                 // omitting expansion offset in builder
2024                 // (HEADER_SIZE_ >> 2)
2025                 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
2026                         | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
2027                         | (addExpansion(expansions, element.m_CEs_[0]) << 4)
2028                         & 0xFFFFF0;
2029
2030                 for (int i = 1; i < element.m_CELength_; i++) {
2031                     addExpansion(expansions, element.m_CEs_[i]);
2032                 }
2033                 if (element.m_CELength_ <= 0xF) {
2034                     expansion |= element.m_CELength_;
2035                 } else {
2036                     addExpansion(expansions, 0);
2037                 }
2038                 element.m_mapCE_ = expansion;
2039                 setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
2040                         (byte) element.m_CELength_, t.m_maxExpansions_);
2041                 if (isJamo(element.m_cPoints_.charAt(0))) {
2042                     t.m_collator_.m_isJamoSpecial_ = true;
2043                     setMaxJamoExpansion(element.m_cPoints_.charAt(0),
2044                             element.m_CEs_[element.m_CELength_ - 1],
2045                             (byte) element.m_CELength_, t.m_maxJamoExpansions_);
2046                 }
2047             }
2048         }
2049
2050         // We treat digits differently - they are "uber special" and should be
2051         // processed differently if numeric collation is on.
2052         int uniChar = 0;
2053         if ((element.m_uchars_.length() == 2)
2054                 && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
2055             uniChar = UCharacterProperty.getRawSupplementary(element.m_uchars_
2056                     .charAt(0), element.m_uchars_.charAt(1));
2057         } else if (element.m_uchars_.length() == 1) {
2058             uniChar = element.m_uchars_.charAt(0);
2059         }
2060
2061         // Here, we either have one normal CE OR mapCE is set. Therefore, we
2062         // stuff only one element to the expansion buffer. When we encounter a
2063         // digit and we don't do numeric collation, we will just pick the CE
2064         // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
2065         // && ucol_prv_getSpecialPrevCE). If we picked a special, further
2066         // processing will occur. If it's a simple CE, we'll return due
2067         // to how the loop is constructed.
2068         if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
2069             // prepare the element
2070             int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
2071                     | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
2072                     | 1;
2073             if (element.m_mapCE_ != 0) {
2074                 // if there is an expansion, we'll pick it here
2075                 expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
2076             } else {
2077                 expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
2078             }
2079             element.m_mapCE_ = expansion;
2080         }
2081
2082         // here we want to add the prefix structure.
2083         // I will try to process it as a reverse contraction, if possible.
2084         // prefix buffer is already reversed.
2085
2086         if (element.m_prefixChars_ != null
2087                 && element.m_prefixChars_.length() - element.m_prefix_ > 0) {
2088             // We keep the seen prefix starter elements in a hashtable; we need
2089             // it to be able to distinguish between the simple codepoints and
2090             // prefix starters. Also, we need to use it for canonical closure.
2091             m_utilElement2_.m_caseBit_ = element.m_caseBit_;
2092             m_utilElement2_.m_CELength_ = element.m_CELength_;
2093             m_utilElement2_.m_CEs_ = element.m_CEs_;
2094             m_utilElement2_.m_mapCE_ = element.m_mapCE_;
2095             // m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
2096             m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
2097             m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
2098             m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
2099             m_utilElement2_.m_variableTop_ = element.m_variableTop_;
2100             m_utilElement2_.m_prefix_ = element.m_prefix_;
2101             m_utilElement2_.m_prefixChars_ = Normalizer.compose(
2102                     element.m_prefixChars_, false);
2103             m_utilElement2_.m_uchars_ = element.m_uchars_;
2104             m_utilElement2_.m_cPoints_ = element.m_cPoints_;
2105             m_utilElement2_.m_cPointsOffset_ = 0;
2106
2107             if (t.m_prefixLookup_ != null) {
2108                 Elements uCE = t.m_prefixLookup_.get(element);
2109                 if (uCE != null) {
2110                     // there is already a set of code points here
2111                     element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);
2112                 } else { // no code points, so this spot is clean
2113                     element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);
2114                     uCE = new Elements(element);
2115                     uCE.m_cPoints_ = uCE.m_uchars_;
2116                     t.m_prefixLookup_.put(uCE, uCE);
2117                 }
2118                 if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
2119                         .length()
2120                         - element.m_prefix_
2121                         || !m_utilElement2_.m_prefixChars_.regionMatches(0,
2122                                 element.m_prefixChars_, element.m_prefix_,
2123                                 m_utilElement2_.m_prefixChars_.length())) {
2124                     // do it!
2125                     m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,
2126                             m_utilElement2_);
2127                 }
2128             }
2129         }
2130
2131         // We need to use the canonical iterator here
2132         // the way we do it is to generate the canonically equivalent strings
2133         // for the contraction and then add the sequences that pass FCD check
2134         if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
2135                 && !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2
2136                         && UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0)) && UTF16
2137                         .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
2138             // this is a contraction, we should check whether a composed form
2139             // should also be included
2140             m_utilCanIter_.setSource(element.m_cPoints_);
2141             String source = m_utilCanIter_.next();
2142             while (source != null && source.length() > 0) {
2143                 if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
2144                     element.m_uchars_ = source;
2145                     element.m_cPoints_ = element.m_uchars_;
2146                     finalizeAddition(t, element);
2147                 }
2148                 source = m_utilCanIter_.next();
2149             }
2150
2151             return element.m_mapCE_;
2152         } else {
2153             return finalizeAddition(t, element);
2154         }
2155     }
2156
2157     /**
2158      * Adds an expansion ce to the expansion vector
2159      * 
2160      * @param expansions
2161      *            vector to add to
2162      * @param value
2163      *            of the expansion
2164      * @return the current position of the new element
2165      */
2166     private static final int addExpansion(List<Integer> expansions, int value) {
2167         expansions.add(Integer.valueOf(value));
2168         return expansions.size() - 1;
2169     }
2170
2171     /**
2172      * Looks for the maximum length of all expansion sequences ending with the
2173      * same collation element. The new size of the maxexpansion table is
2174      * returned.
2175      * 
2176      * @param endexpansion
2177      *            the last expansion collation element to be added
2178      * @param expansionsize
2179      *            size of the expansion
2180      * @param maxexpansion
2181      *            data structure to store the maximum expansion data.
2182      * @return the size of the maxexpansion table
2183      */
2184     private static int setMaxExpansion(int endexpansion, byte expansionsize,
2185             MaxExpansionTable maxexpansion) {
2186         int start = 0;
2187         int limit = maxexpansion.m_endExpansionCE_.size();
2188         long unsigned = (long) endexpansion;
2189         unsigned &= 0xFFFFFFFFl;
2190
2191         // using binary search to determine if last expansion element is
2192         // already in the array
2193         int result = -1;
2194         if (limit > 0) {
2195             while (start < limit - 1) {
2196                 int mid = (start + limit) >> 1;
2197                 long unsignedce = (maxexpansion.m_endExpansionCE_
2198                         .get(mid)).intValue();
2199                 unsignedce &= 0xFFFFFFFFl;
2200                 if (unsigned < unsignedce) {
2201                     limit = mid;
2202                 } else {
2203                     start = mid;
2204                 }
2205             }
2206     
2207             if ((maxexpansion.m_endExpansionCE_.get(start)).intValue() == endexpansion) {
2208                 result = start;
2209             }
2210         }
2211         if (result > -1) {
2212             // found the ce in expansion, we'll just modify the size if it
2213             // is smaller
2214             Object currentsize = maxexpansion.m_expansionCESize_.get(result);
2215             if (((Byte) currentsize).byteValue() < expansionsize) {
2216                 maxexpansion.m_expansionCESize_.set(result, Byte.valueOf(
2217                             expansionsize));
2218             }
2219         } else {
2220             // we'll need to squeeze the value into the array. initial
2221             // implementation. shifting the subarray down by 1
2222             maxexpansion.m_endExpansionCE_.add(start + 1, Integer.valueOf(endexpansion));
2223             maxexpansion.m_expansionCESize_.add(start + 1, Byte.valueOf(expansionsize));
2224         }
2225         return maxexpansion.m_endExpansionCE_.size();
2226     }
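    // Note on the structure maintained above: m_endExpansionCE_ stays sorted
    // by the unsigned value of the ending CE, and m_expansionCESize_ records,
    // for each ending CE, the longest expansion seen so far that ends with it;
    // for instance, adding the same ending CE first with size 2 and later
    // with size 3 leaves a single entry whose recorded size is 3.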
2227
2228     /**
2229      * Sets the maximum length of all jamo expansion sequences ending with the
2230      * same collation element. The new size of the maxexpansion table is
2231      * returned.
2232      * 
2233      * @param ch
2234      *            the jamo codepoint
2235      * @param endexpansion
2236      *            the last expansion collation element to be added
2237      * @param expansionsize
2238      *            size of the expansion
2239      * @param maxexpansion
2240      *            data structure to store the maximum expansion data.
2241      * @return the size of the maxexpansion table
2242      */
2243     private static int setMaxJamoExpansion(char ch, int endexpansion,
2244             byte expansionsize, MaxJamoExpansionTable maxexpansion) {
2245         boolean isV = true;
2246         if (ch >= 0x1100 && ch <= 0x1112) {
2247             // determines L for Jamo, doesn't need to store this since it is
2248             // never at the end of an expansion
2249             if (maxexpansion.m_maxLSize_ < expansionsize) {
2250                 maxexpansion.m_maxLSize_ = expansionsize;
2251             }
2252             return maxexpansion.m_endExpansionCE_.size();
2253         }
2254
2255         if (ch >= 0x1161 && ch <= 0x1175) {
2256             // determines V for Jamo
2257             if (maxexpansion.m_maxVSize_ < expansionsize) {
2258                 maxexpansion.m_maxVSize_ = expansionsize;
2259             }
2260         }
2261
2262         if (ch >= 0x11A8 && ch <= 0x11C2) {
2263             isV = false;
2264             // determines T for Jamo
2265             if (maxexpansion.m_maxTSize_ < expansionsize) {
2266                 maxexpansion.m_maxTSize_ = expansionsize;
2267             }
2268         }
2269
2270         int pos = maxexpansion.m_endExpansionCE_.size();
2271         while (pos > 0) {
2272             pos--;
2273             if ((maxexpansion.m_endExpansionCE_.get(pos)).intValue() == endexpansion) {
2274                 return maxexpansion.m_endExpansionCE_.size();
2275             }
2276         }
2277         maxexpansion.m_endExpansionCE_.add(Integer.valueOf(endexpansion));
2278         maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE);
2279
2280         return maxexpansion.m_endExpansionCE_.size();
2281     }
2282
2283     /**
2284      * Adds a prefix to the table
2285      * 
2286      * @param t
2287      *            build table to update
2288      * @param CE
2289      *            collation element to add
2290      * @param element
2291      *            rule element to add
2292      * @return modified ce
2293      */
2294     private int addPrefix(BuildTable t, int CE, Elements element) {
2295         // Currently the longest prefix we support (in Japanese) is two
2296         // characters long. Although this table could quite easily mimic the
2297         // complete contraction machinery, there is no good reason to build a
2298         // general solution, as it would require some error-prone messing.
2299         ContractionTable contractions = t.m_contractions_;
2300         String oldCP = element.m_cPoints_;
2301         int oldCPOffset = element.m_cPointsOffset_;
2302
2303         contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
2304         // here, we will normalize & add prefix to the table.
2305         int size = element.m_prefixChars_.length() - element.m_prefix_;
2306         for (int j = 1; j < size; j++) {
2307             // First add NFD prefix chars to unsafe CP hash table
2308             // Unless it is a trail surrogate, which is handled algorithmically
2309             // and shouldn't take up space in the table.
2310             char ch = element.m_prefixChars_.charAt(j + element.m_prefix_);
2311             if (!UTF16.isTrailSurrogate(ch)) {
2312                 unsafeCPSet(t.m_unsafeCP_, ch);
2313             }
2314         }
2315
2316         // StringBuffer reversed = new StringBuffer();
2317         m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
2318         for (int j = 0; j < size; j++) {
2319             // prefixes are going to be looked up backwards
2320             // therefore, we will promptly reverse the prefix buffer...
2321             int offset = element.m_prefixChars_.length() - j - 1;
2322             m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset));
2323         }
2324         element.m_prefixChars_ = m_utilStringBuffer_.toString();
2325         element.m_prefix_ = 0;
2326
2327         // the first codepoint is also unsafe, as it forms a 'contraction' with
2328         // the prefix
2329         if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
2330             unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
2331         }
2332
2333         element.m_cPoints_ = element.m_prefixChars_;
2334         element.m_cPointsOffset_ = element.m_prefix_;
2335
2336         // Add the last char of the contraction to the contraction-end hash
2337         // table. unless it is a trail surrogate, which is handled
2338         // algorithmically and shouldn't be in the table
2339         if (!UTF16.isTrailSurrogate(element.m_cPoints_
2340                 .charAt(element.m_cPoints_.length() - 1))) {
2341             ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
2342                     .charAt(element.m_cPoints_.length() - 1));
2343         }
2344         // First we need to check if the contraction starts with a surrogate
2345         // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
2346
2347         // If there are any Jamos in the contraction, we should turn on special
2348         // processing for Jamos
2349         if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
2350             t.m_collator_.m_isJamoSpecial_ = true;
2351         }
2352         // then we need to deal with it
2353         // we could already have something in the table - or we might not
2354         if (!isPrefix(CE)) {
2355             // if it wasn't a contraction, we wouldn't end up here
2356             int firstContractionOffset = addContraction(contractions,
2357                     CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
2358             int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
2359             addContraction(contractions, firstContractionOffset,
2360                     element.m_prefixChars_.charAt(element.m_prefix_), newCE);
2361             addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
2362                     CE);
2363             CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset);
2364         } else {
2365             // we are adding to existing contraction
2366             // there were already some elements in the table, so we need to add
2367             // a new contraction
2368             // Two things can happen here: either the codepoint is already in
2369             // the table, or it is not
2370             char ch = element.m_prefixChars_.charAt(element.m_prefix_);
2371             int position = findCP(contractions, CE, ch);
2372             if (position > 0) {
2373                 // if it is we just continue down the chain
2374                 int eCE = getCE(contractions, CE, position);
2375                 int newCE = processContraction(contractions, element, eCE);
2376                 setContraction(contractions, CE, position, ch, newCE);
2377             } else {
2378                 // if it isn't, we will have to create a new sequence
2379                 processContraction(contractions, element, CE_NOT_FOUND_);
2380                 insertContraction(contractions, CE, ch, element.m_mapCE_);
2381             }
2382         }
2383
2384         element.m_cPoints_ = oldCP;
2385         element.m_cPointsOffset_ = oldCPOffset;
2386
2387         return CE;
2388     }
2389
2390     /**
2391      * Checks if the argument ce is a contraction
2392      * 
2393      * @param CE
2394      *            collation element
2395      * @return true if argument ce is a contraction
2396      */
2397     private static final boolean isContraction(int CE) {
2398         return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
2399     }
2400
2401     /**
2402      * Checks if the argument ce has a prefix
2403      * 
2404      * @param CE
2405      *            collation element
2406      * @return true if argument ce has a prefix
2407      */
2408     private static final boolean isPrefix(int CE) {
2409         return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
2410     }
2411
2412     /**
2413      * Checks if the argument ce is special
2414      * 
2415      * @param CE
2416      *            collation element
2417      * @return true if argument ce is special
2418      */
2419     private static final boolean isSpecial(int CE) {
2420         return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
2421     }
2422
2423     /**
2424      * Gets the tag of the argument ce
2425      * 
2426      * @param CE
2427      *            collation element
2428      * @return the tag of the collation element
2429      */
2430     private static final int getCETag(int CE) {
2431         return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;
2432     }
2433
2434     /**
2435      * Gets the ce at position in contraction table
2436      * 
2437      * @param table
2438      *            contraction table
2439      * @param position
2440      *            offset to the contraction table
2441      * @return ce
2442      */
2443     private static final int getCE(ContractionTable table, int element,
2444             int position) {
2445         element &= 0xFFFFFF;
2446         BasicContractionTable tbl = getBasicContractionTable(table, element);
2447
2448         if (tbl == null) {
2449             return CE_NOT_FOUND_;
2450         }
2451         if (position > tbl.m_CEs_.size() || position == -1) {
2452             return CE_NOT_FOUND_;
2453         } else {
2454             return tbl.m_CEs_.get(position).intValue();
2455         }
2456     }
2457
2458     /**
2459      * Sets the unsafe character
2460      * 
2461      * @param table
2462      *            unsafe table
2463      * @param c
2464      *            character to be added
2465      */
2466     private static final void unsafeCPSet(byte table[], char c) {
2467         int hash = c;
2468         if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2469             if (hash >= 0xd800 && hash <= 0xf8ff) {
2470                 // Part of a surrogate, or in private use area.
2471                 // These don't go in the table
2472                 return;
2473             }
2474             hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2475         }
2476         table[hash >> 3] |= (1 << (hash & 7));
2477     }
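    // Worked example of the hashing above: with UNSAFECP_TABLE_SIZE_ == 1056
    // bytes (8448 bits), a char below 0x2100 indexes its own bit directly,
    // e.g. '\u00E9' (233) sets bit 1 of table[29]; a larger char such as
    // '\u4E00' (19968) folds to (19968 & 0x1fff) + 256 == 3840, i.e. bit 0 of
    // table[480]. Surrogates and private-use chars (0xD800-0xF8FF) are skipped
    // since they are handled elsewhere.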
2478
2479     /**
2480      * Sets the contraction end character
2481      * 
2482      * @param table
2483      *            contraction end table
2484      * @param c
2485      *            character to be added
2486      */
2487     private static final void ContrEndCPSet(byte table[], char c) {
2488         int hash = c;
2489         if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2490             hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2491         }
2492         table[hash >> 3] |= (1 << (hash & 7));
2493     }
2494
2495     /**
2496      * Adds more contractions to the table. If the element is nonexistent, it
2497      * creates one. Returns the element handle.
2498      * 
2499      * @param table
2500      *            contraction table
2501      * @param element
2502      *            offset to the contraction table
2503      * @param codePoint
2504      *            codepoint to add
2505      * @param value
2506      * @return collation element
2507      */
2508     private static int addContraction(ContractionTable table, int element,
2509             char codePoint, int value) {
2510         BasicContractionTable tbl = getBasicContractionTable(table, element);
2511         if (tbl == null) {
2512             tbl = addAContractionElement(table);
2513             element = table.m_elements_.size() - 1;
2514         }
2515
2516         tbl.m_CEs_.add(Integer.valueOf(value));
2517         tbl.m_codePoints_.append(codePoint);
2518         return constructSpecialCE(table.m_currentTag_, element);
2519     }
2520
2521     /**
2522      * Adds a contraction element to the table
2523      * 
2524      * @param table
2525      *            contraction table to update
2526      * @return contraction
2527      */
2528     private static BasicContractionTable addAContractionElement(
2529             ContractionTable table) {
2530         BasicContractionTable result = new BasicContractionTable();
2531         table.m_elements_.add(result);
2532         return result;
2533     }
2534
2535     /**
2536      * Constructs a special ce
2537      * 
2538      * @param tag
2539      *            special tag
2540      * @param CE
2541      *            collation element
2542      * @return a contraction ce
2543      */
2544     private static final int constructSpecialCE(int tag, int CE) {
2545         return RuleBasedCollator.CE_SPECIAL_FLAG_
2546                 | (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF);
2547     }
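    // Illustration, assuming CE_SPECIAL_FLAG_ == 0xF0000000, CE_TAG_SHIFT_ ==
    // 24 and CE_TAG_MASK_ == 0x0F000000 (consistent with isSpecial() and
    // getCETag() above): constructSpecialCE(CE_SPEC_PROC_TAG_, 0x000123)
    // yields 0xFB000123, for which isSpecial() is true, getCETag() returns
    // CE_SPEC_PROC_TAG_ (11), and the low 24 bits carry the table offset.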
2548
2549     /**
2550      * Sets and inserts the element that has a contraction
2551      * 
2552      * @param contractions
2553      *            contraction table
2554      * @param element
2555      *            contracting element
2556      * @param existingCE
2557      * @return contraction ce
2558      */
2559     private static int processContraction(ContractionTable contractions,
2560             Elements element, int existingCE) {
2561         int firstContractionOffset = 0;
2562         // end of recursion
2563         if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
2564             if (isContractionTableElement(existingCE)
2565                     && getCETag(existingCE) == contractions.m_currentTag_) {
2566                 changeContraction(contractions, existingCE, (char) 0,
2567                         element.m_mapCE_);
2568                 changeContraction(contractions, existingCE, (char) 0xFFFF,
2569                         element.m_mapCE_);
2570                 return existingCE;
2571             } else {
2572                 // can't do just that. existingCe might be a contraction,
2573                 // meaning that we need to do another step
2574                 return element.m_mapCE_;
2575             }
2576         }
2577
2578         // this recursion currently feeds on the only element we have...
2579         // We will have to copy it in order to accommodate both backward
2580         // and forward cycles
2581         // we encountered either an empty space or a non-contraction element
2582         // this means we are constructing a new contraction sequence
2583         element.m_cPointsOffset_++;
2584         if (!isContractionTableElement(existingCE)) {
2585             // existingCE is not a contraction element, so start a new chain
2586             firstContractionOffset = addContraction(contractions,
2587                     CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, existingCE);
2588             int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
2589             addContraction(contractions, firstContractionOffset,
2590                     element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE);
2591             addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
2592                     existingCE);
2593             existingCE = constructSpecialCE(contractions.m_currentTag_,
2594                     firstContractionOffset);
2595         } else {
2596             // we are adding to existing contraction
2597             // there were already some elements in the table, so we need to add
2598             // a new contraction
2599             // Two things can happen here: either the codepoint is already in
2600             // the table, or it is not
2601             int position = findCP(contractions, existingCE, element.m_cPoints_
2602                     .charAt(element.m_cPointsOffset_));
2603             if (position > 0) {
2604                 // if it is we just continue down the chain
2605                 int eCE = getCE(contractions, existingCE, position);
2606                 int newCE = processContraction(contractions, element, eCE);
2607                 setContraction(contractions, existingCE, position,
2608                         element.m_cPoints_.charAt(element.m_cPointsOffset_),
2609                         newCE);
2610             } else {
2611                 // if it isn't, we will have to create a new sequence
2612                 int newCE = processContraction(contractions, element,
2613                         CE_NOT_FOUND_);
2614                 insertContraction(contractions, existingCE, element.m_cPoints_
2615                         .charAt(element.m_cPointsOffset_), newCE);
2616             }
2617         }
2618         element.m_cPointsOffset_--;
2619         return existingCE;
2620     }
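    // Illustrative walk-through (hypothetical tailoring of the sequence "ch"):
    // when existingCE is not yet a contraction element, a new contraction
    // element is created whose (char) 0 and (char) 0xFFFF sentinel entries both
    // carry existingCE (the CE for "c" alone), and the CE for the remainder is
    // attached under the next code point ("h"). Deeper sequences repeat this
    // step recursively, one code point at a time.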
2621
2622     /**
2623      * Checks if CE belongs to the contraction table
2624      * 
2625      * @param CE
2626      *            collation element to test
2627      * @return true if CE belongs to the contraction table
2628      */
2629     private static final boolean isContractionTableElement(int CE) {
2630         return isSpecial(CE)
2631                 && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);
2632     }
2633
2634     /**
2635      * Finds the offset of a code point within a contraction element
2636      * 
2637      * @param table
2638      *            contraction table
2639      * @param element
2640      *            offset to the contraction element in the table
2641      * @param codePoint
2642      *            code point to look for
2643      * @return the offset to the code point
2644      */
2645     private static int findCP(ContractionTable table, int element,
2646             char codePoint) {
2647         BasicContractionTable tbl = getBasicContractionTable(table, element);
2648         if (tbl == null) {
2649             return -1;
2650         }
2651
2652         int position = 0;
2653         while (codePoint > tbl.m_codePoints_.charAt(position)) {
2654             position++;
2655             if (position >= tbl.m_codePoints_.length()) {
2656                 return -1;
2657             }
2658         }
2659         if (codePoint == tbl.m_codePoints_.charAt(position)) {
2660             return position;
2661         } else {
2662             return -1;
2663         }
2664     }
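    // The linear scan above assumes m_codePoints_ is kept in ascending order
    // (see insertContraction) and, in practice, is terminated by the 0xFFFF
    // sentinel entry added when a chain is first created, so the loop stops
    // before running past the end for any real code point.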
2665
2666     /**
2667      * Gets the contraction element out of the contraction table
2668      * 
2669      * @param table
2670      *            contraction table
2671      * @param offset
2672      *            to the element in the contraction table
2673      * @return basic contraction element at offset in the contraction table
2674      */
2675     private static final BasicContractionTable getBasicContractionTable(
2676             ContractionTable table, int offset) {
2677         offset &= 0xFFFFFF;
2678         if (offset == 0xFFFFFF) {
2679             return null;
2680         }
2681         return table.m_elements_.get(offset);
2682     }
2683
2684     /**
2685      * Changes the contraction element
2686      * 
2687      * @param table
2688      *            contraction table
2689      * @param element
2690      *            offset to the element in the contraction table
2691      * @param codePoint
2692      *            codepoint
2693      * @param newCE
2694      *            new collation element
2695      * @return the element offset, or CE_NOT_FOUND_ if the code point is absent
2696      */
2697     private static final int changeContraction(ContractionTable table,
2698             int element, char codePoint, int newCE) {
2699         BasicContractionTable tbl = getBasicContractionTable(table, element);
2700         if (tbl == null) {
2701             return 0;
2702         }
2703         int position = 0;
2704         while (codePoint > tbl.m_codePoints_.charAt(position)) {
2705             position++;
2706             if (position >= tbl.m_codePoints_.length()) {
2707                 return CE_NOT_FOUND_;
2708             }
2709         }
2710         if (codePoint == tbl.m_codePoints_.charAt(position)) {
2711             tbl.m_CEs_.set(position, Integer.valueOf(newCE));
2712             return element & 0xFFFFFF;
2713         } else {
2714             return CE_NOT_FOUND_;
2715         }
2716     }
2717
2718     /**
2719      * Sets a part of a contraction sequence in the table. If the element does
2720      * not exist, it is created. Returns the element handle.
2721      * 
2722      * @param table
2723      *            contraction table
2724      * @param element
2725      *            offset to the contraction table
2726      * @param offset offset within the contraction element
2727      * @param codePoint
2728      *            contraction character
2729      * @param value
2730      *            ce value
2731      * @return new contraction ce
2732      */
2733     private static final int setContraction(ContractionTable table,
2734             int element, int offset, char codePoint, int value) {
2735         element &= 0xFFFFFF;
2736         BasicContractionTable tbl = getBasicContractionTable(table, element);
2737         if (tbl == null) {
2738             tbl = addAContractionElement(table);
2739             element = table.m_elements_.size() - 1;
2740         }
2741
2742         tbl.m_CEs_.set(offset, Integer.valueOf(value));
2743         tbl.m_codePoints_.setCharAt(offset, codePoint);
2744         return constructSpecialCE(table.m_currentTag_, element);
2745     }
2746
2747     /**
2748      * Inserts a part of a contraction sequence into the table. Sequences behind
2749      * the offset are moved back. If the element does not exist, it is created.
2750      * 
2751      * @param table
2752      *            contraction
2753      * @param element
2754      *            offset to the table contraction
2755      * @param codePoint
2756      *            code point
2757      * @param value
2758      *            collation element value
2759      * @return contraction collation element
2760      */
2761     private static final int insertContraction(ContractionTable table,
2762             int element, char codePoint, int value) {
2763         element &= 0xFFFFFF;
2764         BasicContractionTable tbl = getBasicContractionTable(table, element);
2765         if (tbl == null) {
2766             tbl = addAContractionElement(table);
2767             element = table.m_elements_.size() - 1;
2768         }
2769
2770         int offset = 0;
2771         while (offset < tbl.m_codePoints_.length()
2772                 && tbl.m_codePoints_.charAt(offset) < codePoint) {
2773             offset++;
2774         }
2775
2776         tbl.m_CEs_.add(offset, Integer.valueOf(value));
2777         tbl.m_codePoints_.insert(offset, codePoint);
2778
2779         return constructSpecialCE(table.m_currentTag_, element);
2780     }
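    // insertContraction() is what keeps each element's code points sorted: it
    // scans for the first code point not smaller than the new one and inserts
    // the CE/code-point pair at that offset, shifting later entries back.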
2781
2782     /**
2783      * Finalize addition
2784      * 
2785      * @param t
2786      *            build table
2787      * @param element
2788      *            to add
2789      */
2790     private final static int finalizeAddition(BuildTable t, Elements element) {
2791         int CE = CE_NOT_FOUND_;
2792         // This should add a completely ignorable element to the
2793         // unsafe table, so that backward iteration will skip
2794         // over it when treating contractions.
2795         if (element.m_mapCE_ == 0) {
2796             for (int i = 0; i < element.m_cPoints_.length(); i++) {
2797                 char ch = element.m_cPoints_.charAt(i);
2798                 if (!UTF16.isTrailSurrogate(ch)) {
2799                     unsafeCPSet(t.m_unsafeCP_, ch);
2800                 }
2801             }
2802         }
2803
2804         if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
2805             // we're adding a contraction
2806             int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
2807             CE = t.m_mapping_.getValue(cp);
2808             CE = addContraction(t, CE, element);
2809         } else {
2810             // easy case
2811             CE = t.m_mapping_.getValue(element.m_cPoints_
2812                     .charAt(element.m_cPointsOffset_));
2813
2814             if (CE != CE_NOT_FOUND_) {
2815                 if (isContractionTableElement(CE)) {
2816                     // adding a non contraction element (thai, expansion,
2817                     // single) to already existing contraction
2818                     if (!isPrefix(element.m_mapCE_)) {
2819                         // we cannot reenter prefix elements - as we are going
2820                         // to create a dead loop
2821                         // Only expansions and regular CEs can go here...
2822                         // Contractions will never happen in this place
2823                         setContraction(t.m_contractions_, CE, 0, (char) 0,
2824                                 element.m_mapCE_);
2825                         // This loop has to change the CE at the end of
2826                         // contraction REDO!
2827                         changeLastCE(t.m_contractions_, CE, element.m_mapCE_);
2828                     }
2829                 } else {
2830                     t.m_mapping_
2831                             .setValue(element.m_cPoints_
2832                                     .charAt(element.m_cPointsOffset_),
2833                                     element.m_mapCE_);
2834                     if (element.m_prefixChars_ != null
2835                             && element.m_prefixChars_.length() > 0
2836                             && getCETag(CE) != CE_IMPLICIT_TAG_) {
2837                         // Add CE for standalone precontext char.
2838                         Elements origElem = new Elements();
2839                         origElem.m_prefixChars_ = null;
2840                         origElem.m_uchars_ = element.m_cPoints_;
2841                         origElem.m_cPoints_ = origElem.m_uchars_;
2842                         origElem.m_CEs_[0] = CE;
2843                         origElem.m_mapCE_ = CE;
2844                         origElem.m_CELength_ = 1;
2845                         finalizeAddition(t, origElem);
2846                     }
2847                 }
2848             } else {
2849                 t.m_mapping_.setValue(element.m_cPoints_
2850                         .charAt(element.m_cPointsOffset_), element.m_mapCE_);
2851             }
2852         }
2853         return CE;
2854     }
2855
2856     /**
2857      * Note regarding surrogate handling: We are interested only in the single
2858      * or leading surrogates in a contraction. If a surrogate is somewhere else
2859      * in the contraction, it is going to be handled as a pair of code units, as
2860      * it doesn't affect the performance AND handling surrogates specially would
2861      * complicate code way too much.
2862      */
2863     private static int addContraction(BuildTable t, int CE, Elements element) {
2864         ContractionTable contractions = t.m_contractions_;
2865         contractions.m_currentTag_ = CE_CONTRACTION_TAG_;
2866
2867         // First we need to check if contractions starts with a surrogate
2868         int cp = UTF16.charAt(element.m_cPoints_, 0);
2869         int cpsize = 1;
2870         if (UCharacter.isSupplementary(cp)) {
2871             cpsize = 2;
2872         }
2873         if (cpsize < element.m_cPoints_.length()) {
2874             // This is a real contraction, if there are other characters after
2875             // the first
2876             int size = element.m_cPoints_.length() - element.m_cPointsOffset_;
2877             for (int j = 1; j < size; j++) {
2878                 // First add contraction chars to unsafe CP hash table
2879                 // Unless it is a trail surrogate, which is handled
2880                 // algorithmically and shouldn't take up space in the table.
2881                 if (!UTF16.isTrailSurrogate(element.m_cPoints_
2882                         .charAt(element.m_cPointsOffset_ + j))) {
2883                     unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_
2884                             .charAt(element.m_cPointsOffset_ + j));
2885                 }
2886             }
2887             // Add the last char of the contraction to the contraction-end
2888             // hash table. unless it is a trail surrogate, which is handled
2889             // algorithmically and shouldn't be in the table
2890             if (!UTF16.isTrailSurrogate(element.m_cPoints_
2891                     .charAt(element.m_cPoints_.length() - 1))) {
2892                 ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
2893                         .charAt(element.m_cPoints_.length() - 1));
2894             }
2895
2896             // If there are any Jamos in the contraction, we should turn on
2897             // special processing for Jamos
2898             if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) {
2899                 t.m_collator_.m_isJamoSpecial_ = true;
2900             }
2901             // then we need to deal with it
2902             // we could already have something in the table - or we might not
2903             element.m_cPointsOffset_ += cpsize;
2904             if (!isContraction(CE)) {
2905                 // CE is not yet a contraction element, so start a new chain
2906                 int firstContractionOffset = addContraction(contractions,
2907                         CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
2908                 int newCE = processContraction(contractions, element,
2909                         CE_NOT_FOUND_);
2910                 addContraction(contractions, firstContractionOffset,
2911                         element.m_cPoints_.charAt(element.m_cPointsOffset_),
2912                         newCE);
2913                 addContraction(contractions, firstContractionOffset,
2914                         (char) 0xFFFF, CE);
2915                 CE = constructSpecialCE(CE_CONTRACTION_TAG_,
2916                         firstContractionOffset);
2917             } else {
2918                 // we are adding to existing contraction
2919                 // there were already some elements in the table, so we need to
2920                 // add a new contraction
2921                 // Two things can happen here: either the codepoint is already
2922                 // in the table, or it is not
2923                 int position = findCP(contractions, CE, element.m_cPoints_
2924                         .charAt(element.m_cPointsOffset_));
2925                 if (position > 0) {
2926                     // if it is we just continue down the chain
2927                     int eCE = getCE(contractions, CE, position);
2928                     int newCE = processContraction(contractions, element, eCE);
2929                     setContraction(
2930                             contractions,
2931                             CE,
2932                             position,
2933                             element.m_cPoints_.charAt(element.m_cPointsOffset_),
2934                             newCE);
2935                 } else {
2936                     // if it isn't, we will have to create a new sequence
2937                     int newCE = processContraction(contractions, element,
2938                             CE_NOT_FOUND_);
2939                     insertContraction(contractions, CE, element.m_cPoints_
2940                             .charAt(element.m_cPointsOffset_), newCE);
2941                 }
2942             }
2943             element.m_cPointsOffset_ -= cpsize;
2944             t.m_mapping_.setValue(cp, CE);
2945         } else if (!isContraction(CE)) {
2946             // this is just a surrogate, and there is no contraction
2947             t.m_mapping_.setValue(cp, element.m_mapCE_);
2948         } else {
2949             // fill out the first stage of the contraction with the surrogate
2950             // CE
2951             changeContraction(contractions, CE, (char) 0, element.m_mapCE_);
2952             changeContraction(contractions, CE, (char) 0xFFFF, element.m_mapCE_);
2953         }
2954         return CE;
2955     }
2956
2957     /**
2958      * Changes the last CE of a contraction element (used when adding non-contractions)
2959      * 
2960      * @param table
2961      *            contraction table
2962      * @param element
2963      *            offset to the contraction table
2964      * @param value
2965      *            collation element value
2966      * @return new collation element
2967      */
2968     private static final int changeLastCE(ContractionTable table, int element,
2969             int value) {
2970         BasicContractionTable tbl = getBasicContractionTable(table, element);
2971         if (tbl == null) {
2972             return 0;
2973         }
2974
2975         tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, Integer.valueOf(value));
2976         return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF);
2977     }
2978
2979     /**
2980      * Given a set of ranges calculated by allocateWeights(), iterate through the
2981      * weights. Sets the next weight in cegenerator.m_current_.
2982      * 
2983      * @param cegenerator
2984      *            object that contains the weight range array and its
2985      *            range count
2986      * @return the next weight
2987      */
2988     private static int nextWeight(CEGenerator cegenerator) {
2989         if (cegenerator.m_rangesLength_ > 0) {
2990             // get maxByte from the .count field
2991             int maxByte = cegenerator.m_ranges_[0].m_count_;
2992             // get the next weight
2993             int weight = cegenerator.m_ranges_[0].m_start_;
2994             if (weight == cegenerator.m_ranges_[0].m_end_) {
2995                 // this range is finished, remove it and move the following
2996                 // ones up
2997                 cegenerator.m_rangesLength_--;
2998                 if (cegenerator.m_rangesLength_ > 0) {
2999                     System.arraycopy(cegenerator.m_ranges_, 1,
3000                             cegenerator.m_ranges_, 0,
3001                             cegenerator.m_rangesLength_);
3002                     cegenerator.m_ranges_[0].m_count_ = maxByte;
3003                     // keep maxByte in ranges[0]
3004                 }
3005             } else {
3006                 // increment the weight for the next value
3007                 cegenerator.m_ranges_[0].m_start_ = incWeight(weight,
3008                         cegenerator.m_ranges_[0].m_length2_, maxByte);
3009             }
3010             return weight;
3011         }
3012         return -1;
3013     }
3014
3015     /**
3016      * Increment the collation weight
3017      * 
3018      * @param weight
3019      *            to increment
3020      * @param length
3021      * @param maxByte
3022      * @return new incremented weight
3023      */
3024     private static final int incWeight(int weight, int length, int maxByte) {
3025         while (true) {
3026             int b = getWeightByte(weight, length);
3027             if (b < maxByte) {
3028                 return setWeightByte(weight, length, b + 1);
3029             } else {
3030                 // roll over, set this byte to BYTE_FIRST_TAILORED_ and
3031                 // increment the previous one
3032                 weight = setWeightByte(weight, length,
3033                         RuleBasedCollator.BYTE_FIRST_TAILORED_);
3034                 --length;
3035             }
3036         }
3037     }
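    // For illustration (hypothetical weight value): with maxByte large enough,
    // incWeight(0x0A0B0000, 2, maxByte) yields 0x0A0C0000; if the byte at the
    // given length already equals maxByte, it rolls over to
    // BYTE_FIRST_TAILORED_ and the previous byte is incremented instead.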
3038
3039     /**
3040      * Gets the weight byte
3041      * 
3042      * @param weight
3043      * @param index
3044      * @return byte
3045      */
3046     private static final int getWeightByte(int weight, int index) {
3047         return (weight >> ((4 - index) << 3)) & 0xff;
3048     }
3049
3050     /**
3051      * Sets the byte at the given index within the weight
3052      * 
3053      * @param weight
3054      * @param index
3055      * @param b
3056      *            byte
3057      */
3058     private static final int setWeightByte(int weight, int index, int b) {
3059         index <<= 3;
3060         // 0xffffffff except a 00 "hole" for the index-th byte
3061         int mask;
3062         if (index < 32) {
3063             mask = 0xffffffff >>> index;
3064         } else {
3065             // Do not use int>>>32 because that does not shift at all
3066             // while we need it to become 0.
3067             // 
3068             // Java Language Specification (Third Edition) 15.19 Shift Operators:
3069             // "If the promoted type of the left-hand operand is int,
3070             // only the five lowest-order bits of the right-hand operand
3071             // are used as the shift distance.
3072             // It is as if the right-hand operand were subjected to
3073             // a bitwise logical AND operator & (§15.22.1) with the mask value 0x1f.
3074             // The shift distance actually used is therefore
3075             // always in the range 0 to 31, inclusive."
3076             mask = 0;
3077         }
3078         index = 32 - index;
3079         mask |= 0xffffff00 << index;
3080         return (weight & mask) | (b << index);
3081     }
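    // Worked example for the two helpers above (hypothetical weight value):
    // for weight 0x0A0B0C00, getWeightByte(w, 1) == 0x0A and
    // getWeightByte(w, 2) == 0x0B, while setWeightByte(w, 2, 0xFF) rebuilds the
    // weight as 0x0AFF0C00, leaving the other bytes untouched.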
3082
3083     /**
3084      * Call getWeightRanges and then determine heuristically which ranges to use
3085      * for a given number of weights between (and excluding) two limits
3086      * 
3087      * @param lowerLimit
3088      * @param upperLimit
3089      * @param n
3090      * @param maxByte
3091      * @param ranges
3092      * @return the number of ranges placed in ranges, or 0 if allocation fails
3093      */
3094     private int allocateWeights(int lowerLimit, int upperLimit, int n,
3095             int maxByte, WeightRange ranges[]) {
3096         // number of usable byte values 3..maxByte
3097         int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
3098         // m_utilCountBuffer_: [0] unused, [5] to make index checks unnecessary
3099         // m_utilLongBuffer_: countBytes to the power of the index,
3100         // kept as long for unsignedness
3101         m_utilLongBuffer_[0] = 1;
3102         m_utilLongBuffer_[1] = countBytes;
3103         m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
3104         m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
3105         m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
3106         int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte,
3107                 countBytes, ranges);
3108         if (rangeCount <= 0) {
3109             return 0;
3110         }
3111         // what is the maximum number of weights with these ranges?
3112         long maxCount = 0;
3113         for (int i = 0; i < rangeCount; ++i) {
3114             maxCount += (long) ranges[i].m_count_
3115                     * m_utilLongBuffer_[4 - ranges[i].m_length_];
3116         }
3117         if (maxCount < n) {
3118             return 0;
3119         }
3120         // set the length2 and count2 fields
3121         for (int i = 0; i < rangeCount; ++i) {
3122             ranges[i].m_length2_ = ranges[i].m_length_;
3123             ranges[i].m_count2_ = ranges[i].m_count_;
3124         }
3125         // try until we find suitably large ranges
3126         while (true) {
3127             // get the smallest number of bytes in a range
3128             int minLength = ranges[0].m_length2_;
3129             // sum up the number of elements that fit into ranges of each byte
3130             // length
3131             Arrays.fill(m_utilCountBuffer_, 0);
3132             for (int i = 0; i < rangeCount; ++i) {
3133                 m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
3134             }
3135             // now try to allocate n elements in the available short ranges
3136             if (n <= m_utilCountBuffer_[minLength]
3137                     + m_utilCountBuffer_[minLength + 1]) {
3138                 // trivial cases, use the first few ranges
3139                 maxCount = 0;
3140                 rangeCount = 0;
3141                 do {
3142                     maxCount += ranges[rangeCount].m_count2_;
3143                     ++rangeCount;
3144                 } while (n > maxCount);
3145                 break;
3146             } else if (n <= ranges[0].m_count2_ * countBytes) {
3147                 // easy case, just make this one range large enough by
3148                 // lengthening it once more, possibly split it
3149                 rangeCount = 1;
3150                 // calculate how to split the range between maxLength-1
3151                 // (count1) and maxLength (count2)
3152                 long power_1 = m_utilLongBuffer_[minLength
3153                         - ranges[0].m_length_];
3154                 long power = power_1 * countBytes;
3155                 int count2 = (int) ((n + power - 1) / power);
3156                 int count1 = ranges[0].m_count_ - count2;
3157                 // split the range
3158                 if (count1 < 1) {
3159                     // lengthen the entire range to maxLength
3160                     lengthenRange(ranges, 0, maxByte, countBytes);
3161                 } else {
3162                     // really split the range
3163                     // create a new range with the end and initial and current
3164                     // length of the old one
3165                     rangeCount = 2;
3166                     ranges[1].m_end_ = ranges[0].m_end_;
3167                     ranges[1].m_length_ = ranges[0].m_length_;
3168                     ranges[1].m_length2_ = minLength;
3169                     // set the end of the first range according to count1
3170                     int i = ranges[0].m_length_;
3171                     int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1;
3172                     // ranges[0].count and count1 may be >countBytes from
3173                     // merging adjacent ranges; b > maxByte is possible
3174                     if (b <= maxByte) {
3175                         ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i,
3176                                 b);
3177                     } else {
3178                         ranges[0].m_end_ = setWeightByte(incWeight(
3179                                 ranges[0].m_start_, i - 1, maxByte), i, b
3180                                 - countBytes);
3181                     }
3182                     // set the bytes in the end weight at length + 1..length2
3183                     // to maxByte
3184                     b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8)
3185                             | maxByte; // this used to be 0xffffffff
3186                     ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i)
3187                             | (b >>> (i << 3)) & (b << ((4 - minLength) << 3));
3188                     // set the start of the second range to immediately follow
3189                     // the end of the first one
3190                     ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength,
3191                             maxByte);
3192                     // set the count values (informational)
3193                     ranges[0].m_count_ = count1;
3194                     ranges[1].m_count_ = count2;
3195
3196                     ranges[0].m_count2_ = (int) (count1 * power_1);
3197                     // will be *countBytes when lengthened
3198                     ranges[1].m_count2_ = (int) (count2 * power_1);
3199
3200                     // lengthen the second range to maxLength
3201                     lengthenRange(ranges, 1, maxByte, countBytes);
3202                 }
3203                 break;
3204             }
3205             // no good match, lengthen all minLength ranges and iterate
3206             for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {
3207                 lengthenRange(ranges, i, maxByte, countBytes);
3208             }
3209         }
3210
3211         if (rangeCount > 1) {
3212             // sort the ranges by weight values
3213             Arrays.sort(ranges, 0, rangeCount);
3214         }
3215
3216         // set maxByte in ranges[0] for ucol_nextWeight()
3217         ranges[0].m_count_ = maxByte;
3218
3219         return rangeCount;
3220     }
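    // Summary of the allocation strategy above: getWeightRanges() supplies the
    // raw gaps between the two limits; the loop then either takes the shortest
    // ranges as-is, splits and lengthens the first range when it alone can hold
    // n weights, or lengthens all shortest ranges and retries. nextWeight()
    // later walks ranges[0] incrementally, using its m_count_ field as maxByte.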
3221
3222     /**
3223      * Updates the range length
3224      * 
3225      * @param range
3226      *            weight range array
3227      * @param offset
3228      *            to weight range array
3229      * @param maxByte
3230      * @param countBytes
3231      * @return new length
3232      */
3233     private static final int lengthenRange(WeightRange range[], int offset,
3234             int maxByte, int countBytes) {
3235         int length = range[offset].m_length2_ + 1;
3236         range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length,
3237                 RuleBasedCollator.BYTE_FIRST_TAILORED_);
3238         range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length,
3239                 maxByte);
3240         range[offset].m_count2_ *= countBytes;
3241         range[offset].m_length2_ = length;
3242         return length;
3243     }
3244
3245     /**
3246      * Sets the byte at the given length position to trail, zeroing the bytes after it
3247      * 
3248      * @param weight
3249      * @param length
3250      * @param trail
3251      * @return new weight
3252      */
3253     private static final int setWeightTrail(int weight, int length, int trail) {
3254         length = (4 - length) << 3;
3255         return (weight & (0xffffff00 << length)) | (trail << length);
3256     }
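    // For illustration (hypothetical weight): setWeightTrail(0x0A0B0C0D, 2, 0xFF)
    // keeps the first byte, sets the second byte to 0xFF and clears the rest,
    // producing 0x0AFF0000.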
3257
3258     /**
3259      * Takes two CE weights and calculates the possible ranges of weights between
3260      * the two limits, excluding the limits themselves. For weights with up to 4
3261      * bytes there are up to 2*4-1=7 ranges.
3262      * 
3263      * @param lowerLimit
3264      * @param upperLimit
3265      * @param maxByte
3266      * @param countBytes
3267      * @param ranges
3268      * @return the number of weight ranges written to ranges
3269      */
3270     private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte,
3271             int countBytes, WeightRange ranges[]) {
3272         // assume that both lowerLimit & upperLimit are not 0
3273         // get the lengths of the limits
3274         int lowerLength = lengthOfWeight(lowerLimit);
3275         int upperLength = lengthOfWeight(upperLimit);
3276         if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
3277             return 0;
3278         }
3279         // check that neither is a prefix of the other
3280         if (lowerLength < upperLength) {
3281             if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
3282                 return 0;
3283             }
3284         }
3285         // if the upper limit is a prefix of the lower limit then the earlier
3286         // test lowerLimit >= upperLimit has caught it
3287         // reset local variables
3288         // With the limit lengths of 1..4, there are up to 7 ranges for
3289         // allocation:
3290         // range minimum length
3291         // lower[4] 4
3292         // lower[3] 3
3293         // lower[2] 2
3294         // middle 1
3295         // upper[2] 2
3296         // upper[3] 3
3297         // upper[4] 4
3298         // We are now going to calculate up to 7 ranges.
3299         // Some of them will typically overlap, so we will then have to merge
3300         // and eliminate ranges.
3301
3302         // We have to clean cruft from previous invocations
3303         // before doing anything. C++ already does that
3304         for (int length = 0; length < 5; length++) {
3305             m_utilLowerWeightRange_[length].clear();
3306             m_utilUpperWeightRange_[length].clear();
3307         }
3308         m_utilWeightRange_.clear();
3309
3310         int weight = lowerLimit;
3311         for (int length = lowerLength; length >= 2; --length) {
3312             m_utilLowerWeightRange_[length].clear();
3313             int trail = getWeightByte(weight, length);
3314             if (trail < maxByte) {
3315                 m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(
3316                         weight, length);
3317                 m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(weight,
3318                         length, maxByte);
3319                 m_utilLowerWeightRange_[length].m_length_ = length;
3320                 m_utilLowerWeightRange_[length].m_count_ = maxByte - trail;
3321             }
3322             weight = truncateWeight(weight, length - 1);
3323         }
3324         m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);
3325
3326         weight = upperLimit;
3327         // [0] and [1] of m_utilUpperWeightRange_ are not used -
3328         // this simplifies indexing
3329
3330         for (int length = upperLength; length >= 2; length--) {
3331             int trail = getWeightByte(weight, length);
3332             if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
3333                 m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(
3334                         weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_);
3335                 m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(weight,
3336                         length);
3337                 m_utilUpperWeightRange_[length].m_length_ = length;
3338                 m_utilUpperWeightRange_[length].m_count_ = trail
3339                         - RuleBasedCollator.BYTE_FIRST_TAILORED_;
3340             }
3341             weight = truncateWeight(weight, length - 1);
3342         }
3343         m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);
3344
3345         // set the middle range
3346         m_utilWeightRange_.m_length_ = 1;
3347         if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,
3348                 m_utilWeightRange_.m_start_) >= 0) {
3349             // if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) {
3350             m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;
3351         } else {
3352             // eliminate overlaps
3353             // remove the middle range
3354             m_utilWeightRange_.m_count_ = 0;
3355             // reduce or remove the lower ranges that go beyond upperLimit
3356             for (int length = 4; length >= 2; --length) {
3357                 if (m_utilLowerWeightRange_[length].m_count_ > 0
3358                         && m_utilUpperWeightRange_[length].m_count_ > 0) {
3359                     int start = m_utilUpperWeightRange_[length].m_start_;
3360                     int end = m_utilLowerWeightRange_[length].m_end_;
3361                     if (end >= start
3362                             || incWeight(end, length, maxByte) == start) {
3363                         // lower and upper ranges collide or are directly
3364                         // adjacent: merge these two and remove all shorter
3365                         // ranges
3366                         start = m_utilLowerWeightRange_[length].m_start_;
3367                         end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;
3368                         // merging directly adjacent ranges needs to subtract
3369                         // the 0/1 gaps in between;
3370                         // it may result in a range with count>countBytes
3371                         m_utilLowerWeightRange_[length].m_count_ = getWeightByte(
3372                                 end, length)
3373                                 - getWeightByte(start, length)
3374                                 + 1
3375                                 + countBytes
3376                                 * (getWeightByte(end, length - 1) - getWeightByte(
3377                                         start, length - 1));
3378                         m_utilUpperWeightRange_[length].m_count_ = 0;
3379                         while (--length >= 2) {
3380                             m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;
3381                         }
3382                         break;
3383                     }
3384                 }
3385             }
3386         }
3387
3388         // copy the ranges, shortest first, into the result array
3389         int rangeCount = 0;
3390         if (m_utilWeightRange_.m_count_ > 0) {
3391             ranges[0] = new WeightRange(m_utilWeightRange_);
3392             rangeCount = 1;
3393         }
3394         for (int length = 2; length <= 4; ++length) {
3395             // copy upper first so that later the middle range is more likely
3396             // the first one to use
3397             if (m_utilUpperWeightRange_[length].m_count_ > 0) {
3398                 ranges[rangeCount] = new WeightRange(
3399                         m_utilUpperWeightRange_[length]);
3400                 ++rangeCount;
3401             }
3402             if (m_utilLowerWeightRange_[length].m_count_ > 0) {
3403                 ranges[rangeCount] = new WeightRange(
3404                         m_utilLowerWeightRange_[length]);
3405                 ++rangeCount;
3406             }
3407         }
3408         return rangeCount;
3409     }
3410
3411     /**
3412      * Truncates the weight to the given number of leading bytes
3413      * 
3414      * @param weight
3415      * @param length
3416      * @return truncated weight
3417      */
3418     private static final int truncateWeight(int weight, int length) {
3419         return weight & (0xffffffff << ((4 - length) << 3));
3420     }
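    // For illustration (hypothetical weight): truncateWeight(0x0A0B0C0D, 2)
    // keeps only the first two bytes, producing 0x0A0B0000.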
3421
3422     /**
3423      * Length of the weight
3424      * 
3425      * @param weight
3426      * @return length of the weight
3427      */
3428     private static final int lengthOfWeight(int weight) {
3429         if ((weight & 0xffffff) == 0) {
3430             return 1;
3431         } else if ((weight & 0xffff) == 0) {
3432             return 2;
3433         } else if ((weight & 0xff) == 0) {
3434             return 3;
3435         }
3436         return 4;
3437     }
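    // For illustration: lengthOfWeight(0x0A000000) == 1,
    // lengthOfWeight(0x0A0B0000) == 2 and lengthOfWeight(0x0A0B0C0D) == 4 -
    // the length is the position of the last non-zero byte, counting from the
    // most significant byte.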
3438
3439     /**
3440      * Increment the weight trail
3441      * 
3442      * @param weight
3443      * @param length
3444      * @return new weight
3445      */
3446     private static final int incWeightTrail(int weight, int length) {
3447         return weight + (1 << ((4 - length) << 3));
3448     }
3449
3450     /**
3451      * Decrement the weight trail
3452      * 
3453      * @param weight
3454      * @param length
3455      * @return new weight
3456      */
3457     private static int decWeightTrail(int weight, int length) {
3458         return weight - (1 << ((4 - length) << 3));
3459     }
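    // For illustration (hypothetical weight): incWeightTrail(0x0A0B0000, 2)
    // gives 0x0A0C0000 and decWeightTrail(0x0A0B0000, 2) gives 0x0A0A0000;
    // only the byte at the given length position changes (barring carries).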
3460
3461     /**
3462      * Finds the offset of a code point within a contraction element
3463      * 
3464      * @param tbl
3465      *            contraction table
3466      * @param codePoint
3467      *            code point to look for
3468      * @return the offset to the code point
3469      */
3470     private static int findCP(BasicContractionTable tbl, char codePoint) {
3471         int position = 0;
3472         while (codePoint > tbl.m_codePoints_.charAt(position)) {
3473             position++;
3474             if (position >= tbl.m_codePoints_.length()) {
3475                 return -1;
3476             }
3477         }
3478         if (codePoint == tbl.m_codePoints_.charAt(position)) {
3479             return position;
3480         } else {
3481             return -1;
3482         }
3483     }
3484
3485     /**
3486      * Finds a contraction ce
3487      * 
3488      * @param table
3489      * @param element
3490      * @param ch
3491      * @return ce
3492      */
3493     private static int findCE(ContractionTable table, int element, char ch) {
3494         if (table == null) {
3495             return CE_NOT_FOUND_;
3496         }
3497         BasicContractionTable tbl = getBasicContractionTable(table, element);
3498         if (tbl == null) {
3499             return CE_NOT_FOUND_;
3500         }
3501         int position = findCP(tbl, ch);
3502         if (position >= tbl.m_CEs_.size() || position < 0) {
3503             return CE_NOT_FOUND_;
3504         }
3505         return tbl.m_CEs_.get(position).intValue();
3506     }
3507
3508     /**
3509      * Checks if the string is tailored in the contraction
3510      * 
3511      * @param table
3512      *            contraction table
3513      * @param element
3514      * @param array
3515      *            character array to check
3516      * @param offset
3517      *            array offset
3518      * @return true if it is tailored
3519      */
3520     private static boolean isTailored(ContractionTable table, int element,
3521             char array[], int offset) {
3522         while (array[offset] != 0) {
3523             element = findCE(table, element, array[offset]);
3524             if (element == CE_NOT_FOUND_) {
3525                 return false;
3526             }
3527             if (!isContractionTableElement(element)) {
3528                 return true;
3529             }
3530             offset++;
3531         }
3532         if (getCE(table, element, 0) != CE_NOT_FOUND_) {
3533             return true;
3534         } else {
3535             return false;
3536         }
3537     }
3538
3539     /**
3540      * Assemble RuleBasedCollator
3541      * 
3542      * @param t
3543      *            build table
3544      * @param collator
3545      *            to update
3546      */
3547     private void assembleTable(BuildTable t, RuleBasedCollator collator) {
3548         IntTrieBuilder mapping = t.m_mapping_;
3549         List<Integer> expansions = t.m_expansions_;
3550         ContractionTable contractions = t.m_contractions_;
3551         MaxExpansionTable maxexpansion = t.m_maxExpansions_;
3552
3553         // contraction offset has to be in since we are building on the
3554         // UCA contractions
3555         // int beforeContractions = (HEADER_SIZE_
3556         // + paddedsize(expansions.size() << 2)) >>> 1;
3557         collator.m_contractionOffset_ = 0;
3558         int contractionsSize = constructTable(contractions);
3559
3560         // the following operation depends on the trie data. Therefore, we have
3561         // to do it before the trie is compacted
3562         // sets jamo expansions
3563         getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_,
3564                 collator.m_isJamoSpecial_);
3565
3566         // TODO: LATIN1 array is now in the utrie - it should be removed from
3567         // the calculation
3568         setAttributes(collator, t.m_options_);
3569         // copy expansions
3570         int size = expansions.size();
3571         collator.m_expansion_ = new int[size];
3572         for (int i = 0; i < size; i++) {
3573             collator.m_expansion_[i] = expansions.get(i).intValue();
3574         }
3575         // contractions block
3576         if (contractionsSize != 0) {
3577             // copy contraction index
3578             collator.m_contractionIndex_ = new char[contractionsSize];
3579             contractions.m_codePoints_.getChars(0, contractionsSize,
3580                     collator.m_contractionIndex_, 0);
3581             // copy contraction collation elements
3582             collator.m_contractionCE_ = new int[contractionsSize];
3583             for (int i = 0; i < contractionsSize; i++) {
3584                 collator.m_contractionCE_[i] = contractions.m_CEs_.get(i).intValue();
3585             }
3586         }
3587         // copy mapping table
3588         collator.m_trie_ = mapping.serialize(t,
3589                 RuleBasedCollator.DataManipulate.getInstance());
3590         // copy max expansion table
3591         // not copying the first element which is a dummy
3592         // to be in synch with icu4c's builder, we continue to use the
3593         // expansion offset
3594         // omitting expansion offset in builder
3595         collator.m_expansionOffset_ = 0;
3596         size = maxexpansion.m_endExpansionCE_.size();
3597         collator.m_expansionEndCE_ = new int[size - 1];
3598         for (int i = 1; i < size; i++) {
3599             collator.m_expansionEndCE_[i - 1] = maxexpansion.m_endExpansionCE_
3600                     .get(i).intValue();
3601         }
3602         collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
3603         for (int i = 1; i < size; i++) {
3604             collator.m_expansionEndCEMaxSize_[i - 1] = maxexpansion.m_expansionCESize_
3605                     .get(i).byteValue();
3606         }
3607         // Unsafe chars table. Finish it off, then copy it.
3608         unsafeCPAddCCNZ(t);
3609         // Or in unsafebits from UCA, making a combined table.
3610         for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
3611             t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
3612         }
3613         collator.m_unsafe_ = t.m_unsafeCP_;
3614
3615         // Finish building Contraction Ending chars hash table and then copy it
3616         // out.
3617         // Or in unsafebits from UCA, making a combined table
3618         for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
3619             t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
3620         }
3621         collator.m_contractionEnd_ = t.m_contrEndCP_;
3622     }
3623
3624     /**
3625      * Sets the collator attributes according to the parsed rule options.
3626      * 
3627      * @param collator
3628      *            collator whose attributes are to be set
3629      * @param option
3630      *            option set to apply
3631      */
3632     private static final void setAttributes(RuleBasedCollator collator,
3633             CollationRuleParser.OptionSet option) {
3634         collator.latinOneFailed_ = true;
3635         collator.m_caseFirst_ = option.m_caseFirst_;
3636         collator.setDecomposition(option.m_decomposition_);
3637         collator
3638                 .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);
3639         collator.setCaseLevel(option.m_isCaseLevel_);
3640         collator.setFrenchCollation(option.m_isFrenchCollation_);
3641         collator.m_isHiragana4_ = option.m_isHiragana4_;
3642         collator.setStrength(option.m_strength_);
3643         collator.m_variableTopValue_ = option.m_variableTopValue_;
3644         collator.m_reorderCodes_ = option.m_scriptOrder_;
3645         collator.latinOneFailed_ = false;
3646     }
3647
3648     /**
3649      * Constructs the contraction table
3650      * 
3651      * @param table
3652      *            contraction table
3653      * @return the total size of the assembled contraction data
3654      */
3655     private int constructTable(ContractionTable table) {
3656         // See how much memory we need
3657         int tsize = table.m_elements_.size();
3658         if (tsize == 0) {
3659             return 0;
3660         }
3661         table.m_offsets_.clear();
3662         int position = 0;
3663         for (int i = 0; i < tsize; i++) {
3664             table.m_offsets_.add(Integer.valueOf(position));
3665             position += table.m_elements_.get(i).m_CEs_
3666                     .size();
3667         }
3668         table.m_CEs_.clear();
3669         table.m_codePoints_.delete(0, table.m_codePoints_.length());
3670         // Now stuff the things in
3671         StringBuilder cpPointer = table.m_codePoints_;
3672         List<Integer> CEPointer = table.m_CEs_;
3673         for (int i = 0; i < tsize; i++) {
3674             BasicContractionTable bct = table.m_elements_.get(i);
3675             int size = bct.m_CEs_.size();
3676             char ccMax = 0;
3677             char ccMin = 255;
3678             int offset = CEPointer.size();
3679             CEPointer.add(bct.m_CEs_.get(0));
3680             for (int j = 1; j < size; j++) {
3681                 char ch = bct.m_codePoints_.charAt(j);
3682                 char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);
3683                 if (cc > ccMax) {
3684                     ccMax = cc;
3685                 }
3686                 if (cc < ccMin) {
3687                     ccMin = cc;
3688                 }
3689                 cpPointer.append(ch);
3690                 CEPointer.add(bct.m_CEs_.get(j));
3691             }
3692             cpPointer.insert(offset,
3693                     (char) ((((ccMin == ccMax) ? 1 : 0) << 8) | ccMax));
3694             for (int j = 0; j < size; j++) {
3695                 if (isContractionTableElement(CEPointer.get(offset + j).intValue())) {
3696                     int ce = CEPointer.get(offset + j).intValue();
3697                     CEPointer.set(offset + j,
3698                             Integer.valueOf(constructSpecialCE(getCETag(ce),
3699                                     table.m_offsets_.get(getContractionOffset(ce))
3700                                     .intValue())));
3701                 }
3702             }
3703         }
3704
3705         for (int i = 0; i <= 0x10FFFF; i++) {
3706             int CE = table.m_mapping_.getValue(i);
3707             if (isContractionTableElement(CE)) {
3708                 CE = constructSpecialCE(getCETag(CE),
3709                         table.m_offsets_.get(getContractionOffset(CE)).intValue());
3710                 table.m_mapping_.setValue(i, CE);
3711             }
3712         }
3713         return position;
3714     }
3715
3716     /**
3717      * Get contraction offset
3718      * 
3719      * @param ce
3720      *            collation element
3721      * @return contraction offset
3722      */
3723     private static final int getContractionOffset(int ce) {
3724         return ce & 0xFFFFFF;
3725     }
3726
3727     /**
3728      * Gets the maximum Jamo expansion
3729      * 
3730      * @param mapping
3731      *            trie table
3732      * @param maxexpansion
3733      *            maximum expansion table
3734      * @param maxjamoexpansion
3735      *            maximum jamo expansion table
3736      * @param jamospecial
3737      *            is jamo special?
3738      */
3739     private static void getMaxExpansionJamo(IntTrieBuilder mapping,
3740             MaxExpansionTable maxexpansion,
3741             MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {
3742         int VBASE = 0x1161;
3743         int TBASE = 0x11A8;
3744         int VCOUNT = 21;
3745         int TCOUNT = 28;
3746         int v = VBASE + VCOUNT - 1;
3747         int t = TBASE + TCOUNT - 1;
3748
3749         while (v >= VBASE) {
3750             int ce = mapping.getValue(v);
3751             if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3752                 setMaxExpansion(ce, (byte) 2, maxexpansion);
3753             }
3754             v--;
3755         }
3756
3757         while (t >= TBASE) {
3758             int ce = mapping.getValue(t);
3759             if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3760                 setMaxExpansion(ce, (byte) 3, maxexpansion);
3761             }
3762             t--;
3763         }
3764         // According to the docs, 99% of the time, the Jamo will not be special
3765         if (jamospecial) {
3766             // gets the max expansion in all unicode characters
3767             int count = maxjamoexpansion.m_endExpansionCE_.size();
3768             byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_
3769                     + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);
3770             byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);
3771
3772             while (count > 0) {
3773                 count--;
3774                 if ((maxjamoexpansion.m_isV_.get(count))
3775                         .booleanValue()) {
3776                     setMaxExpansion(
3777                             (maxjamoexpansion.m_endExpansionCE_
3778                                     .get(count)).intValue(), maxVSize,
3779                             maxexpansion);
3780                 } else {
3781                     setMaxExpansion(
3782                             (maxjamoexpansion.m_endExpansionCE_
3783                                     .get(count)).intValue(), maxTSize,
3784                             maxexpansion);
3785                 }
3786             }
3787         }
3788     }
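    // The constants above are the usual conjoining Jamo parameters: VBASE
    // (0x1161) and TBASE (0x11A8) are the first medial vowel and the first
    // trailing consonant of the Hangul Jamo block, and VCOUNT/TCOUNT mirror the
    // counts used in the Unicode Hangul syllable composition algorithm.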
3789
3790     /**
3791      * Adds to the UnsafeCP hash table all chars with combining class != 0
3792      * 
3793      * @param t
3794      *            build table
3795      */
3796     private final void unsafeCPAddCCNZ(BuildTable t) {
3797         boolean buildCMTable = (buildCMTabFlag && (t.cmLookup == null));
3798         char[] cm = null; // combining mark array
3799         int[] index = new int[256];
3800         int count = 0;
3801
3802         if (buildCMTable) {
3803             cm = new char[0x10000];
3804         }
3805         for (char c = 0; c < 0xffff; c++) {
3806             int fcd;
3807             if (UTF16.isLeadSurrogate(c)) {
3808                 fcd = 0;
3809                 if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
3810                     int supp = Character.toCodePoint(c, (char)0xdc00);
3811                     int suppLimit = supp + 0x400;
3812                     while (supp < suppLimit) {
3813                         fcd |= m_nfcImpl_.getFCD16FromNormData(supp++);
3814                     }
3815                 }
3816             } else {
3817                 fcd = m_nfcImpl_.getFCD16(c);
3818             }
3819             // TODO: review for handling supplementary characters
3820             if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
3821                     (UTF16.isLeadSurrogate(c) && fcd != 0)) {
3822                 // c is a leading surrogate with some FCD data
3823                 unsafeCPSet(t.m_unsafeCP_, c);
3824                 if (buildCMTable) {
3825                     int cc = (fcd & 0xff);
3826                     int pos = (cc << 8) + index[cc];
3827                     cm[pos] = c;
3828                     index[cc]++;
3829                     count++;
3830                 }
3831             }
3832         }
3833
3834         if (t.m_prefixLookup_ != null) {
3835             Enumeration<Elements> els = Collections.enumeration(t.m_prefixLookup_.values());
3836             while (els.hasMoreElements()) {
3837                 Elements e = els.nextElement();
3838                 // codepoints here are in the NFD form. We need to add the
3839                 // first code point of the NFC form to unsafe, because
3840                 // strcoll needs to back up over them.
3841                 // weiv: This is wrong! See the comment above.
3842                 // String decomp = Normalizer.decompose(e.m_cPoints_, true);
3843                 // unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
3844                 // it should be:
3845                 String comp = Normalizer.compose(e.m_cPoints_, false);
3846                 unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
3847             }
3848         }
3849
3850         if (buildCMTable) {
3851             t.cmLookup = new CombinClassTable();
3852             t.cmLookup.generate(cm, count, index);
3853         }
3854     }
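    // Layout of the cm/index pair built above: cm is partitioned into 256-slot
    // buckets, one per combining class; (cc << 8) + index[cc] addresses the
    // next free slot for class cc, and cmLookup.generate(cm, count, index)
    // turns the filled buckets into the combining-mark lookup table.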
3855
3856     /**
3857      * Adds canonical closure entries for a range of same-category characters
3858      * 
3859      * @param t
3860      *            build table
3861      * @param collator
3862      *            RuleBasedCollator
3863      * @param colEl
3864      *            collation element iterator
3865      * @param start start of the code point range, inclusive
3866      * @param limit end of the code point range, exclusive
3867      * @param type
3868      *            character type
3869      * @return always true, so that the enumeration over ranges continues
3870      */
3871     private boolean enumCategoryRangeClosureCategory(BuildTable t,
3872             RuleBasedCollator collator, CollationElementIterator colEl,
3873             int start, int limit, int type) {
3874         if (type != UCharacterCategory.UNASSIGNED
3875                 && type != UCharacterCategory.PRIVATE_USE) {
3876             // if the range is assigned, we might omit more categories later
3877
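                 // For each code point in the range that has a canonical
                 // decomposition which the collator does not already treat as
                 // equal to the composed form, copy the CEs of the decomposition
                 // (or an existing prefix mapping) onto the composed code point.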
3878             for (int u32 = start; u32 < limit; u32++) {
3879                 String decomp = m_nfcImpl_.getDecomposition(u32);
3880                 if (decomp != null) {
3881                     String comp = UCharacter.toString(u32);
3882                     if (!collator.equals(comp, decomp)) {
3883                         m_utilElement_.m_cPoints_ = decomp;
3884                         m_utilElement_.m_prefix_ = 0;
3885                         Elements prefix = t.m_prefixLookup_.get(m_utilElement_);
3886                         if (prefix == null) {
3887                             m_utilElement_.m_cPoints_ = comp;
3888                             m_utilElement_.m_prefix_ = 0;
3889                             m_utilElement_.m_prefixChars_ = null;
3890                             colEl.setText(decomp);
3891                             int ce = colEl.next();
3892                             m_utilElement_.m_CELength_ = 0;
3893                             while (ce != CollationElementIterator.NULLORDER) {
3894                                 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;
3895                                 ce = colEl.next();
3896                             }
3897                         } else {
3898                             m_utilElement_.m_cPoints_ = comp;
3899                             m_utilElement_.m_prefix_ = 0;
3900                             m_utilElement_.m_prefixChars_ = null;
3901                             m_utilElement_.m_CELength_ = 1;
3902                             m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
3903                             // This character uses a prefix. We have to add it
3904                             // to the unsafe table, as its decomposed form is
3905                             // already in there. In Japanese, this happens for
3906                             // \u309e and \u30fe.
3907                             // Since unsafeCPSet is static in ucol_elm, we are
3908                             // going to wrap it up in the unsafeCPAddCCNZ
3909                             // function.
3910                         }
3911                         addAnElement(t, m_utilElement_);
3912                     }
3913                 }
3914             }
3915         }
3916         return true;
3917     }
3918
3919     /**
3920      * Determine if a character is a Jamo
3921      * 
3922      * @param ch
3923      *            character to test
3924      * @return true if ch is a Jamo, false otherwise
3925      */
3926     private static final boolean isJamo(char ch) {
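             // Modern conjoining Jamo ranges: leading consonants U+1100..U+1112,
             // vowels U+1161..U+1175, trailing consonants U+11A8..U+11C2.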
3927         return (ch >= 0x1100 && ch <= 0x1112) || (ch >= 0x1161 && ch <= 0x1175)
3928                 || (ch >= 0x11A8 && ch <= 0x11C2);
3929     }
3930
3931     /**
3932      * Produces canonical closure
3933      */
3934     private void canonicalClosure(BuildTable t) {
3935         BuildTable temp = new BuildTable(t);
3936         assembleTable(temp, temp.m_collator_);
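             // Assemble a temporary collator from a copy of the build table so
             // that closure CEs can be computed with the tailoring built so far.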
3937         // produce canonical closure
3938         CollationElementIterator coleiter = temp.m_collator_
3939                 .getCollationElementIterator("");
3940         RangeValueIterator typeiter = UCharacter.getTypeIterator();
3941         RangeValueIterator.Element element = new RangeValueIterator.Element();
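             // Walk all general-category ranges of Unicode and add closure
             // entries for the assigned, non-private-use ranges.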
3942         while (typeiter.next(element)) {
3943             enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter,
3944                     element.start, element.limit, element.value);
3945         }
3946
3947         t.cmLookup = temp.cmLookup;
3948         temp.cmLookup = null;
3949
3950         for (int i = 0; i < m_parser_.m_resultLength_; i++) {
3951             char baseChar, firstCM;
3952             // now we need to generate the CEs
3953             // We stuff the initial value in the buffers, and increase the
3954             // appropriate buffer according to strength */
3955             // createElements(t, m_parser_.m_listHeader_[i]);
3956             CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_;
3957             m_utilElement_.clear();
3958             while (tok != null) {
3959                 m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
3960                 m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
3961                 if (tok.m_prefix_ != 0) {
3962                     // we will just copy the prefix here, and adjust
3963                     // accordingly in the addPrefix function in ucol_elm.
3964                     // The reason is that we
3965                     // need to add both composed AND decomposed elements to the
3966                     // unsafe table.
3967                     int size = tok.m_prefix_ >> 24;
3968                     int offset = tok.m_prefix_ & 0x00FFFFFF;
3969                     m_utilElement_.m_prefixChars_ = m_parser_.m_source_
3970                             .substring(offset, offset + size);
3971                     size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
3972                     offset = (tok.m_source_ & 0x00FFFFFF)
3973                             + (tok.m_prefix_ >> 24);
3974                     m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
3975                             offset, offset + size);
3976                 } else {
3977                     m_utilElement_.m_prefixChars_ = null;
3978                     int offset = tok.m_source_ & 0x00FFFFFF;
3979                     int size = tok.m_source_ >>> 24;
3980                     m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
3981                             offset, offset + size);
3982                 }
3983                 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
3984
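                     // Scan the token's code points for a base character
                     // followed by a combining mark; only such pairs need
                     // tail canonical closures.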
3985                 baseChar = firstCM = 0; // reset
3986                 for (int j = 0; j < m_utilElement_.m_cPoints_.length()
3987                         - m_utilElement_.m_cPointsOffset_; j++) {
3988
3989                     int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(j));  // TODO: review for handling supplementary characters
3990                     if ((fcd & 0xff) == 0) {
3991                         baseChar = m_utilElement_.m_cPoints_.charAt(j);
3992                     } else {
3993                         if ((baseChar != 0) && (firstCM == 0)) {
3994                             firstCM = m_utilElement_.m_cPoints_.charAt(j); // first
3995                                                                            // combining
3996                                                                            // mark
3997                         }
3998                     }
3999                 }
4000
4001                 if ((baseChar != 0) && (firstCM != 0)) {
4002                     addTailCanonicalClosures(t, temp.m_collator_, coleiter,
4003                             baseChar, firstCM);
4004                 }
4005                 tok = tok.m_next_;
4006             }
4007         }
4008     }
4009
4010     private void addTailCanonicalClosures(BuildTable t,
4011             RuleBasedCollator m_collator, CollationElementIterator colEl,
4012             char baseChar, char cMark) {
4013         if (t.cmLookup == null) {
4014             return;
4015         }
4016         CombinClassTable cmLookup = t.cmLookup;
4017         int[] index = cmLookup.index;
4018         int cClass = m_nfcImpl_.getFCD16(cMark) & 0xff;  // TODO: review for handling supplementary characters
4019         int maxIndex = 0;
4020         char[] precompCh = new char[256];
4021         int[] precompClass = new int[256];
4022         int precompLen = 0;
4023         Elements element = new Elements();
4024
4025         if (cClass > 0) {
4026             maxIndex = index[cClass - 1];
4027         }
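             // Try to compose baseChar with each combining mark stored below
             // index[cClass - 1] in cmLookup; whenever the result is a single
             // precomposed character, add closure entries for the current
             // contraction with that mark substituted for cMark.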
4028         for (int i = 0; i < maxIndex; i++) {
4029             StringBuilder decompBuf = new StringBuilder();
4030             decompBuf.append(baseChar).append(cmLookup.cPoints[i]);
4031             String comp = Normalizer.compose(decompBuf.toString(), false);
4032             if (comp.length() == 1) {
4033                 precompCh[precompLen] = comp.charAt(0);
4034                 precompClass[precompLen] = m_nfcImpl_.getFCD16(cmLookup.cPoints[i]) & 0xff;  // TODO: review for handling supplementary characters
4035                 precompLen++;
4036                 StringBuilder decomp = new StringBuilder();
4037                 for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) {
4038                     if (m_utilElement_.m_cPoints_.charAt(j) == cMark) {
4039                         decomp.append(cmLookup.cPoints[i]);
4040                     } else {
4041                         decomp.append(m_utilElement_.m_cPoints_.charAt(j));
4042                     }
4043                 }
4044                 comp = Normalizer.compose(decomp.toString(), false);
4045                 StringBuilder buf = new StringBuilder(comp);
4046                 buf.append(cMark);
4047                 decomp.append(cMark);
4048                 comp = buf.toString();
4049
4050                 element.m_cPoints_ = decomp.toString();
4051                 element.m_CELength_ = 0;
4052                 element.m_prefix_ = 0;
4053                 Elements prefix = t.m_prefixLookup_.get(element);
4054                 element.m_cPoints_ = comp;
4055                 element.m_uchars_ = comp;
4056
4057                 if (prefix == null) {
4058                     element.m_prefix_ = 0;
4059                     element.m_prefixChars_ = null;
4060                     colEl.setText(decomp.toString());
4061                     int ce = colEl.next();
4062                     element.m_CELength_ = 0;
4063                     while (ce != CollationElementIterator.NULLORDER) {
4064                         element.m_CEs_[element.m_CELength_++] = ce;
4065                         ce = colEl.next();
4066                     }
4067                 } else {
4068                     element.m_cPoints_ = comp;
4069                     element.m_prefix_ = 0;
4070                     element.m_prefixChars_ = null;
4071                     element.m_CELength_ = 1;
4072                     element.m_CEs_[0] = prefix.m_mapCE_;
4073                 }
4074                 setMapCE(t, element);
4075                 finalizeAddition(t, element);
4076
4077                 if (comp.length() > 2) {
4078                     // This is a fix for tailoring contractions with an accented
4079                     // character at the end of the contraction string.
4080                     addFCD4AccentedContractions(t, colEl, comp, element);
4081                 }
4082                 if (precompLen > 1) {
4083                     precompLen = addMultiCMontractions(t, colEl, element,
4084                             precompCh, precompClass, precompLen, cMark, i,
4085                             decomp.toString());
4086                 }
4087             }
4088         }
4089
4090     }
4091
4092     private void setMapCE(BuildTable t, Elements element) {
4093         List<Integer> expansions = t.m_expansions_;
4094         element.m_mapCE_ = 0;
4095
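             // A two-CE expansion whose continuation carries only primary
             // weights, with common secondary and tertiary weights in the
             // first CE, is packed into a single "long primary" special CE;
             // anything else is stored in the expansion table and referenced
             // through an expansion special CE.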
4096         if (element.m_CELength_ == 2 // a two CE expansion
4097                 && RuleBasedCollator.isContinuation(element.m_CEs_[1])
4098                 // that has only primaries in the continuation
4099                 && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0
4100                 // a common secondary
4101                 && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
4102                 // and a common tertiary
4103                 && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) {
4110
4111             element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
4112             // a long primary special
4113                     | (CE_LONG_PRIMARY_TAG_ << 24)
4114                     // first and second byte of primary
4115                     | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
4116                     // third byte of primary
4117                     | ((element.m_CEs_[1] >> 24) & 0xFF);
4118         } else {
4119             // omitting expansion offset in builder
4120             // (HEADER_SIZE_ >> 2)
4121             int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
4122                     | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
4123                     | (addExpansion(expansions, element.m_CEs_[0]) << 4)
4124                     & 0xFFFFF0;
4125
4126             for (int i = 1; i < element.m_CELength_; i++) {
4127                 addExpansion(expansions, element.m_CEs_[i]);
4128             }
4129             if (element.m_CELength_ <= 0xF) {
4130                 expansion |= element.m_CELength_;
4131             } else {
4132                 addExpansion(expansions, 0);
4133             }
4134             element.m_mapCE_ = expansion;
4135             setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
4136                     (byte) element.m_CELength_, t.m_maxExpansions_);
4137         }
4138     }
4139
4140     private int addMultiCMontractions(BuildTable t,
4141             CollationElementIterator colEl, Elements element, char[] precompCh,
4142             int[] precompClass, int maxComp, char cMark, int cmPos,
4143             String decomp) {
4144
4145         CombinClassTable cmLookup = t.cmLookup;
4146         char[] combiningMarks = { cMark };
4147         int cMarkClass = UCharacter.getCombiningClass(cMark) & 0xFF;
4148         String comMark = new String(combiningMarks);
4149         int noOfPrecomposedChs = maxComp;
4150
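             // For each precomposed character collected so far, try up to two
             // new combinations: the re-decomposed precomposed character plus
             // the current combining mark, and the existing decomposition plus
             // the precomposed character. Results that compose to a single code
             // point are added to the table and appended to precompCh.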
4151         for (int j = 0; j < maxComp; j++) {
4152             int count = 0;
4153             StringBuilder temp;
4154
4155             do {
4156                 String newDecomp, comp;
4157
4158                 if (count == 0) { // Decompose the saved precomposed char.
4159                     newDecomp = Normalizer.decompose(
4160                             new String(precompCh, j, 1), false);
4161                     temp = new StringBuilder(newDecomp);
4162                     temp.append(cmLookup.cPoints[cmPos]);
4163                     newDecomp = temp.toString();
4164                 } else {
4165                     temp = new StringBuilder(decomp);
4166                     temp.append(precompCh[j]);
4167                     newDecomp = temp.toString();
4168                 }
4169                 comp = Normalizer.compose(newDecomp, false);
4170                 if (comp.length() == 1) {
4171                     temp.append(cMark);
4172                     element.m_cPoints_ = temp.toString();
4173                     element.m_CELength_ = 0;
4174                     element.m_prefix_ = 0;
4175                     Elements prefix = t.m_prefixLookup_.get(element);
4176                     element.m_cPoints_ = comp + comMark;
4177                     if (prefix == null) {
4178                         element.m_prefix_ = 0;
4179                         element.m_prefixChars_ = null;
4180                         colEl.setText(temp.toString());
4181                         int ce = colEl.next();
4182                         element.m_CELength_ = 0;
4183                         while (ce != CollationElementIterator.NULLORDER) {
4184                             element.m_CEs_[element.m_CELength_++] = ce;
4185                             ce = colEl.next();
4186                         }
4187                     } else {
4188                         element.m_cPoints_ = comp;
4189                         element.m_prefix_ = 0;
4190                         element.m_prefixChars_ = null;
4191                         element.m_CELength_ = 1;
4192                         element.m_CEs_[0] = prefix.m_mapCE_;
4193                     }
4194                     setMapCE(t, element);
4195                     finalizeAddition(t, element);
4196                     precompCh[noOfPrecomposedChs] = comp.charAt(0);
4197                     precompClass[noOfPrecomposedChs] = cMarkClass;
4198                     noOfPrecomposedChs++;
4199                 }
4200             } while (++count < 2 && (precompClass[j] == cMarkClass));
4201         }
4202         return noOfPrecomposedChs;
4203     }
4204
4205     private void addFCD4AccentedContractions(BuildTable t,
4206             CollationElementIterator colEl, String data, Elements element) {
4207         String decomp = Normalizer.decompose(data, false);
4208         String comp = Normalizer.compose(data, false);
4209
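             // Map the composed form of the contraction to the CEs of its
             // decomposed form, unless a prefix entry already exists for the
             // decomposition.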
4210         element.m_cPoints_ = decomp;
4211         element.m_CELength_ = 0;
4212         element.m_prefix_ = 0;
4213         Elements prefix = t.m_prefixLookup_.get(element);
4214         if (prefix == null) {
4215             element.m_cPoints_ = comp;
4216             element.m_prefix_ = 0;
4217             element.m_prefixChars_ = null;
4218             element.m_CELength_ = 0;
4219             colEl.setText(decomp);
4220             int ce = colEl.next();
4221             element.m_CELength_ = 0;
4222             while (ce != CollationElementIterator.NULLORDER) {
4223                 element.m_CEs_[element.m_CELength_++] = ce;
4224                 ce = colEl.next();
4225             }
4226             addAnElement(t, element);
4227         }
4228     }
4229
4230     private void processUCACompleteIgnorables(BuildTable t) {
4231         TrieIterator trieiterator = new TrieIterator(
4232                 RuleBasedCollator.UCA_.m_trie_);
4233         RangeValueIterator.Element element = new RangeValueIterator.Element();
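             // Walk the UCA trie: any code point that is completely ignorable
             // in the UCA (trie value 0) and still unmapped in this tailoring
             // gets an explicit single zero CE.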
4234         while (trieiterator.next(element)) {
4235             int start = element.start;
4236             int limit = element.limit;
4237             if (element.value == 0) {
4238                 while (start < limit) {
4239                     int CE = t.m_mapping_.getValue(start);
4240                     if (CE == CE_NOT_FOUND_) {
4241                         m_utilElement_.m_prefix_ = 0;
4242                         m_utilElement_.m_uchars_ = UCharacter.toString(start);
4243                         m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
4244                         m_utilElement_.m_cPointsOffset_ = 0;
4245                         m_utilElement_.m_CELength_ = 1;
4246                         m_utilElement_.m_CEs_[0] = 0;
4247                         addAnElement(t, m_utilElement_);
4248                     }
4249                     start++;
4250                 }
4251             }
4252         }
4253     }
4254 }