]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / collate / src / com / ibm / icu / text / CollationParsedRuleBuilder.java
1 /**\r
2  *******************************************************************************\r
3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 package com.ibm.icu.text;\r
8 \r
9 import java.io.IOException;\r
10 import java.text.ParseException;\r
11 import java.util.Arrays;\r
12 import java.util.Enumeration;\r
13 import java.util.Hashtable;\r
14 import java.util.Vector;\r
15 \r
16 import com.ibm.icu.impl.IntTrieBuilder;\r
17 import com.ibm.icu.impl.Norm2AllModes;\r
18 import com.ibm.icu.impl.Normalizer2Impl;\r
19 import com.ibm.icu.impl.TrieBuilder;\r
20 import com.ibm.icu.impl.TrieIterator;\r
21 import com.ibm.icu.impl.UCharacterProperty;\r
22 import com.ibm.icu.impl.Utility;\r
23 import com.ibm.icu.lang.UCharacter;\r
24 import com.ibm.icu.lang.UCharacterCategory;\r
25 import com.ibm.icu.util.RangeValueIterator;\r
26 import com.ibm.icu.util.VersionInfo;\r
27 \r
28 /**\r
29  * Class for building a collator from a list of collation rules. This class is\r
30  * uses CollationRuleParser\r
31  * \r
32  * @author Syn Wee Quek\r
33  * @since release 2.2, June 11 2002\r
34  */\r
35 final class CollationParsedRuleBuilder {\r
36     // package private constructors ------------------------------------------\r
37 \r
38     /**\r
39      * Constructor\r
40      * \r
41      * @param rules\r
42      *            collation rules\r
43      * @exception ParseException\r
44      *                thrown when argument rules have an invalid syntax\r
45      */\r
46     CollationParsedRuleBuilder(String rules) throws ParseException {\r
47         m_parser_ = new CollationRuleParser(rules);\r
48         m_parser_.assembleTokenList();\r
49         m_utilColEIter_ = RuleBasedCollator.UCA_\r
50                 .getCollationElementIterator("");\r
51     }\r
52 \r
53     // package private inner classes -----------------------------------------\r
54 \r
55     /**\r
56      * Inverse UCA wrapper\r
57      */\r
58     static class InverseUCA {\r
59         // package private constructor ---------------------------------------\r
60 \r
61         InverseUCA() {\r
62         }\r
63 \r
64         // package private data member ---------------------------------------\r
65 \r
66         /**\r
67          * Array list of characters\r
68          */\r
69         int m_table_[];\r
70         /**\r
71          * Array list of continuation characters\r
72          */\r
73         char m_continuations_[];\r
74 \r
75         /**\r
76          * UCA version of inverse UCA table\r
77          */\r
78         VersionInfo m_UCA_version_;\r
79 \r
80         // package private method --------------------------------------------\r
81 \r
82         /**\r
83          * Returns the previous inverse ces of the argument ces\r
84          * \r
85          * @param ce\r
86          *            ce to test\r
87          * @param contce\r
88          *            continuation ce to test\r
89          * @param strength\r
90          *            collation strength\r
91          * @param prevresult\r
92          *            an array to store the return results previous inverse ce\r
93          *            and previous inverse continuation ce\r
94          * @return result of the inverse ce\r
95          */\r
96         final int getInversePrevCE(int ce, int contce, int strength,\r
97                 int prevresult[]) {\r
98             int result = findInverseCE(ce, contce);\r
99 \r
100             if (result < 0) {\r
101                 prevresult[0] = CollationElementIterator.NULLORDER;\r
102                 return -1;\r
103             }\r
104 \r
105             ce &= STRENGTH_MASK_[strength];\r
106             contce &= STRENGTH_MASK_[strength];\r
107 \r
108             prevresult[0] = ce;\r
109             prevresult[1] = contce;\r
110 \r
111             while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce\r
112                     && (prevresult[1] & STRENGTH_MASK_[strength]) == contce\r
113                     && result > 0) {\r
114                 // this condition should prevent falling off the edge of the\r
115                 // world\r
116                 // here, we end up in a singularity - zero\r
117                 prevresult[0] = m_table_[3 * (--result)];\r
118                 prevresult[1] = m_table_[3 * result + 1];\r
119             }\r
120             return result;\r
121         }\r
122 \r
123         final int getCEStrengthDifference(int CE, int contCE, int prevCE,\r
124                 int prevContCE) {\r
125             int strength = Collator.TERTIARY;\r
126             while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))\r
127                     && (strength != 0)) {\r
128                 strength--;\r
129             }\r
130             return strength;\r
131         }\r
132 \r
133         private int compareCEs(int source0, int source1, int target0,\r
134                 int target1) {\r
135             int s1 = source0, s2, t1 = target0, t2;\r
136             if (RuleBasedCollator.isContinuation(source1)) {\r
137                 s2 = source1;\r
138             } else {\r
139                 s2 = 0;\r
140             }\r
141             if (RuleBasedCollator.isContinuation(target1)) {\r
142                 t2 = target1;\r
143             } else {\r
144                 t2 = 0;\r
145             }\r
146 \r
147             int s = 0, t = 0;\r
148             if (s1 == t1 && s2 == t2) {\r
149                 return 0;\r
150             }\r
151             s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >>> 16);\r
152             t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >>> 16);\r
153             if (s == t) {\r
154                 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;\r
155                 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;\r
156                 if (s == t) {\r
157                     s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);\r
158                     t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);\r
159                     return Utility.compareUnsigned(s, t);\r
160                 } else {\r
161                     return Utility.compareUnsigned(s, t);\r
162                 }\r
163             } else {\r
164                 return Utility.compareUnsigned(s, t);\r
165             }\r
166         }\r
167 \r
168         /**\r
169          * Finding the inverse CE of the argument CEs\r
170          * \r
171          * @param ce\r
172          *            CE to be tested\r
173          * @param contce\r
174          *            continuation CE\r
175          * @return inverse CE\r
176          */\r
177         int findInverseCE(int ce, int contce) {\r
178             int bottom = 0;\r
179             int top = m_table_.length / 3;\r
180             int result = 0;\r
181 \r
182             while (bottom < top - 1) {\r
183                 result = (top + bottom) >> 1;\r
184                 int first = m_table_[3 * result];\r
185                 int second = m_table_[3 * result + 1];\r
186                 int comparison = compareCEs(first, second, ce, contce);\r
187                 if (comparison > 0) {\r
188                     top = result;\r
189                 } else if (comparison < 0) {\r
190                     bottom = result;\r
191                 } else {\r
192                     break;\r
193                 }\r
194             }\r
195 \r
196             return result;\r
197         }\r
198 \r
199         /**\r
200          * Getting gap offsets in the inverse UCA\r
201          * \r
202          * @param listheader\r
203          *            parsed token lists\r
204          * @exception Exception\r
205          *                thrown when error occurs while finding the collation\r
206          *                gaps\r
207          */\r
208         void getInverseGapPositions(\r
209                 CollationRuleParser.TokenListHeader listheader)\r
210                 throws Exception {\r
211             // reset all the gaps\r
212             CollationRuleParser.Token token = listheader.m_first_;\r
213             int tokenstrength = token.m_strength_;\r
214 \r
215             for (int i = 0; i < 3; i++) {\r
216                 listheader.m_gapsHi_[3 * i] = 0;\r
217                 listheader.m_gapsHi_[3 * i + 1] = 0;\r
218                 listheader.m_gapsHi_[3 * i + 2] = 0;\r
219                 listheader.m_gapsLo_[3 * i] = 0;\r
220                 listheader.m_gapsLo_[3 * i + 1] = 0;\r
221                 listheader.m_gapsLo_[3 * i + 2] = 0;\r
222                 listheader.m_numStr_[i] = 0;\r
223                 listheader.m_fStrToken_[i] = null;\r
224                 listheader.m_lStrToken_[i] = null;\r
225                 listheader.m_pos_[i] = -1;\r
226             }\r
227 \r
228             if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_\r
229                     && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {\r
230                 // implicits -\r
231                 listheader.m_pos_[0] = 0;\r
232                 int t1 = listheader.m_baseCE_;\r
233                 int t2 = listheader.m_baseContCE_;\r
234                 listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);\r
235                 listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);\r
236                 listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);\r
237                 int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_\r
238                         | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;\r
239                 primaryCE = RuleBasedCollator.impCEGen_\r
240                         .getImplicitFromRaw(RuleBasedCollator.impCEGen_\r
241                                 .getRawFromImplicit(primaryCE) + 1);\r
242 \r
243                 t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;\r
244                 t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_\r
245                         | RuleBasedCollator.CE_CONTINUATION_MARKER_;\r
246 \r
247                 // if (listheader.m_baseCE_ < 0xEF000000) {\r
248                 // // first implicits have three byte primaries, with a gap of\r
249                 // // one so we esentially need to add 2 to the top byte in\r
250                 // // listheader.m_baseContCE_\r
251                 // t2 += 0x02000000;\r
252                 // }\r
253                 // else {\r
254                 // // second implicits have four byte primaries, with a gap of\r
255                 // // IMPLICIT_LAST2_MULTIPLIER_\r
256                 // // Now, this guy is not really accessible here, so until we\r
257                 // // find a better way to pass it around, assume that the gap\r
258                 // is 1\r
259                 // t2 += 0x00020000;\r
260                 // }\r
261                 listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);\r
262                 listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);\r
263                 listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);\r
264             } else if (listheader.m_indirect_ == true\r
265                     && listheader.m_nextCE_ != 0) {\r
266                 listheader.m_pos_[0] = 0;\r
267                 int t1 = listheader.m_baseCE_;\r
268                 int t2 = listheader.m_baseContCE_;\r
269                 listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);\r
270                 listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);\r
271                 listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);\r
272                 t1 = listheader.m_nextCE_;\r
273                 t2 = listheader.m_nextContCE_;\r
274                 listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);\r
275                 listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);\r
276                 listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);\r
277             } else {\r
278                 while (true) {\r
279                     if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {\r
280                         listheader.m_pos_[tokenstrength] = getInverseNext(\r
281                                 listheader, tokenstrength);\r
282                         if (listheader.m_pos_[tokenstrength] >= 0) {\r
283                             listheader.m_fStrToken_[tokenstrength] = token;\r
284                         } else {\r
285                             // The CE must be implicit, since it's not in the\r
286                             // table\r
287                             // Error\r
288                             throw new Exception("Internal program error");\r
289                         }\r
290                     }\r
291 \r
292                     while (token != null && token.m_strength_ >= tokenstrength) {\r
293                         if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {\r
294                             listheader.m_lStrToken_[tokenstrength] = token;\r
295                         }\r
296                         token = token.m_next_;\r
297                     }\r
298                     if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {\r
299                         // check if previous interval is the same and merge the\r
300                         // intervals if it is so\r
301                         if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {\r
302                             listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];\r
303                             listheader.m_fStrToken_[tokenstrength + 1] = null;\r
304                             listheader.m_lStrToken_[tokenstrength + 1] = null;\r
305                             listheader.m_pos_[tokenstrength + 1] = -1;\r
306                         }\r
307                     }\r
308                     if (token != null) {\r
309                         tokenstrength = token.m_strength_;\r
310                     } else {\r
311                         break;\r
312                     }\r
313                 }\r
314                 for (int st = 0; st < 3; st++) {\r
315                     int pos = listheader.m_pos_[st];\r
316                     if (pos >= 0) {\r
317                         int t1 = m_table_[3 * pos];\r
318                         int t2 = m_table_[3 * pos + 1];\r
319                         listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,\r
320                                 Collator.PRIMARY);\r
321                         listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2,\r
322                                 Collator.SECONDARY);\r
323                         listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24\r
324                                 | (t2 & 0x3f) << 16;\r
325                         // pos --;\r
326                         // t1 = m_table_[3 * pos];\r
327                         // t2 = m_table_[3 * pos + 1];\r
328                         t1 = listheader.m_baseCE_;\r
329                         t2 = listheader.m_baseContCE_;\r
330 \r
331                         listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,\r
332                                 Collator.PRIMARY);\r
333                         listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2,\r
334                                 Collator.SECONDARY);\r
335                         listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24\r
336                                 | (t2 & 0x3f) << 16;\r
337                     }\r
338                 }\r
339             }\r
340         }\r
341 \r
342         /**\r
343          * Gets the next CE in the inverse table\r
344          * \r
345          * @param listheader\r
346          *            token list header\r
347          * @param strength\r
348          *            collation strength\r
349          * @return next ce\r
350          */\r
351         private final int getInverseNext(\r
352                 CollationRuleParser.TokenListHeader listheader, int strength) {\r
353             int ce = listheader.m_baseCE_;\r
354             int secondce = listheader.m_baseContCE_;\r
355             int result = findInverseCE(ce, secondce);\r
356 \r
357             if (result < 0) {\r
358                 return -1;\r
359             }\r
360 \r
361             ce &= STRENGTH_MASK_[strength];\r
362             secondce &= STRENGTH_MASK_[strength];\r
363 \r
364             int nextce = ce;\r
365             int nextcontce = secondce;\r
366 \r
367             while ((nextce & STRENGTH_MASK_[strength]) == ce\r
368                     && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {\r
369                 nextce = m_table_[3 * (++result)];\r
370                 nextcontce = m_table_[3 * result + 1];\r
371             }\r
372 \r
373             listheader.m_nextCE_ = nextce;\r
374             listheader.m_nextContCE_ = nextcontce;\r
375 \r
376             return result;\r
377         }\r
378     }\r
379 \r
380     // package private data members ------------------------------------------\r
381 \r
382     /**\r
383      * Inverse UCA, instantiate only when required\r
384      */\r
385     static final InverseUCA INVERSE_UCA_;\r
386 \r
387     /**\r
388      * UCA and Inverse UCA version do not match\r
389      */\r
390     private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";\r
391 \r
392     /**\r
393      * UCA and Inverse UCA version do not match\r
394      */\r
395     private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";\r
396 \r
397     /**\r
398      * Initializing the inverse UCA\r
399      */\r
400     static {\r
401         InverseUCA temp = null;\r
402         try {\r
403             temp = CollatorReader.getInverseUCA();\r
404         } catch (IOException e) {\r
405         }\r
406         /*\r
407          * try { String invdat = "/com/ibm/icu/impl/data/invuca.icu";\r
408          * InputStream i =\r
409          * CollationParsedRuleBuilder.class.getResourceAsStream(invdat);\r
410          * BufferedInputStream b = new BufferedInputStream(i, 110000);\r
411          * INVERSE_UCA_ = CollatorReader.readInverseUCA(b); b.close();\r
412          * i.close(); } catch (Exception e) { e.printStackTrace(); throw new\r
413          * RuntimeException(e.getMessage()); }\r
414          */\r
415 \r
416         if (temp != null && RuleBasedCollator.UCA_ != null) {\r
417             if (!temp.m_UCA_version_\r
418                     .equals(RuleBasedCollator.UCA_.m_UCA_version_)) {\r
419                 throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);\r
420             }\r
421         } else {\r
422             throw new RuntimeException(UCA_NOT_INSTANTIATED_);\r
423         }\r
424 \r
425         INVERSE_UCA_ = temp;\r
426     }\r
427 \r
428     // package private methods -----------------------------------------------\r
429 \r
430     /**\r
431      * Parse and sets the collation rules in the argument collator\r
432      * \r
433      * @param collator\r
434      *            to set\r
435      * @exception Exception\r
436      *                thrown when internal program error occurs\r
437      */\r
438     void setRules(RuleBasedCollator collator) throws Exception {\r
439         if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) {\r
440             // we have a set of rules, let's make something of it\r
441             assembleTailoringTable(collator);\r
442         } else { // no rules, but no error either must be only options\r
443             // We will init the collator from UCA\r
444             collator.setWithUCATables();\r
445         }\r
446         // And set only the options\r
447         m_parser_.setDefaultOptionsInCollator(collator);\r
448     }\r
449 \r
450     private void copyRangeFromUCA(BuildTable t, int start, int end) {\r
451         int u = 0;\r
452         for (u = start; u <= end; u++) {\r
453             // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND\r
454             int CE = t.m_mapping_.getValue(u);\r
455             if (CE == CE_NOT_FOUND_\r
456             // this test is for contractions that are missing the starting\r
457                     // element. Looks like latin-1 should be done before\r
458                     // assembling the table, even if it results in more false\r
459                     // closure elements\r
460                     || (isContractionTableElement(CE) && getCE(\r
461                             t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {\r
462                 // m_utilElement_.m_uchars_ = str.toString();\r
463                 m_utilElement_.m_uchars_ = UCharacter.toString(u);\r
464                 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;\r
465                 m_utilElement_.m_prefix_ = 0;\r
466                 m_utilElement_.m_CELength_ = 0;\r
467                 m_utilElement_.m_prefixChars_ = null;\r
468                 m_utilColEIter_.setText(m_utilElement_.m_uchars_);\r
469                 while (CE != CollationElementIterator.NULLORDER) {\r
470                     CE = m_utilColEIter_.next();\r
471                     if (CE != CollationElementIterator.NULLORDER) {\r
472                         m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;\r
473                     }\r
474                 }\r
475                 addAnElement(t, m_utilElement_);\r
476             }\r
477         }\r
478     }\r
479 \r
480     /**\r
481      * 2. Eliminate the negative lists by doing the following for each non-null\r
482      * negative list: o if previousCE(baseCE, strongestN) != some ListHeader X's\r
483      * baseCE, create new ListHeader X o reverse the list, add to the end of X's\r
484      * positive list. Reset the strength of the first item you add, based on the\r
485      * stronger strength levels of the two lists.\r
486      * \r
487      * 3. For each ListHeader with a non-null positive list: o Find all\r
488      * character strings with CEs between the baseCE and the next/previous CE,\r
489      * at the strength of the first token. Add these to the tailoring. ? That\r
490      * is, if UCA has ... x <<< X << x' <<< X' < y ..., and the tailoring has &\r
491      * x < z... ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...\r
492      * \r
493      * It is possible that this part should be done even while constructing list\r
494      * The problem is that it is unknown what is going to be the strongest\r
495      * weight. So we might as well do it here o Allocate CEs for each token in\r
496      * the list, based on the total number N of the largest level difference,\r
497      * and the gap G between baseCE and nextCE at that level. The relation *\r
498      * between the last item and nextCE is the same as the strongest strength. o\r
499      * Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) ? There are 3\r
500      * primary items: a, d, e. Fit them into the primary gap. Then fit b and c\r
501      * into the secondary gap between a and d, then fit q into the tertiary gap\r
502      * between b and c. o Example: baseCE << b <<< q << c * nextCE(X,2) ? There\r
503      * are 2 secondary items: b, c. Fit them into the secondary gap. Then fit q\r
504      * into the tertiary gap between b and c. o When incrementing primary\r
505      * values, we will not cross high byte boundaries except where there is only\r
506      * a single-byte primary. That is to ensure that the script reordering will\r
507      * continue to work.\r
508      * \r
509      * @param collator\r
510      *            the rule based collator to update\r
511      * @exception Exception\r
512      *                thrown when internal program error occurs\r
513      */\r
514     void assembleTailoringTable(RuleBasedCollator collator) throws Exception {\r
515 \r
516         for (int i = 0; i < m_parser_.m_resultLength_; i++) {\r
517             // now we need to generate the CEs\r
518             // We stuff the initial value in the buffers, and increase the\r
519             // appropriate buffer according to strength\r
520             if (m_parser_.m_listHeader_[i].m_first_ != null) {\r
521                 // if there are any elements\r
522                 // due to the way parser works, subsequent tailorings\r
523                 // may remove all the elements from a sequence, therefore\r
524                 // leaving an empty tailoring sequence.\r
525                 initBuffers(m_parser_.m_listHeader_[i]);\r
526             }\r
527         }\r
528 \r
529         if (m_parser_.m_variableTop_ != null) {\r
530             // stuff the variable top value\r
531             m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;\r
532             // remove it from the list\r
533             if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) { // first\r
534                                                                                                // in\r
535                                                                                                // list\r
536                 m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;\r
537             }\r
538             if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {\r
539                 // first in list\r
540                 m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;\r
541             }\r
542             if (m_parser_.m_variableTop_.m_next_ != null) {\r
543                 m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;\r
544             }\r
545             if (m_parser_.m_variableTop_.m_previous_ != null) {\r
546                 m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;\r
547             }\r
548         }\r
549 \r
550         BuildTable t = new BuildTable(m_parser_);\r
551 \r
552         // After this, we have assigned CE values to all regular CEs now we\r
553         // will go through list once more and resolve expansions, make\r
554         // UCAElements structs and add them to table\r
555         for (int i = 0; i < m_parser_.m_resultLength_; i++) {\r
556             // now we need to generate the CEs\r
557             // We stuff the initial value in the buffers, and increase the\r
558             // appropriate buffer according to strength */\r
559             createElements(t, m_parser_.m_listHeader_[i]);\r
560         }\r
561 \r
562         m_utilElement_.clear();\r
563         StringBuilder str = new StringBuilder();\r
564 \r
565         // add latin-1 stuff\r
566         copyRangeFromUCA(t, 0, 0xFF);\r
567 \r
568         // add stuff for copying\r
569         if (m_parser_.m_copySet_ != null) {\r
570             int i = 0;\r
571             for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {\r
572                 copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i),\r
573                         m_parser_.m_copySet_.getRangeEnd(i));\r
574             }\r
575         }\r
576 \r
577         // copy contractions from the UCA - this is felt mostly for cyrillic\r
578         char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;\r
579         int offset = 0;\r
580         while (conts[offset] != 0) {\r
581             // tailoredCE = ucmpe32_get(t.m_mapping, *conts);\r
582             int tailoredCE = t.m_mapping_.getValue(conts[offset]);\r
583             Elements prefixElm = null;\r
584             if (tailoredCE != CE_NOT_FOUND_) {\r
585                 boolean needToAdd = true;\r
586                 if (isContractionTableElement(tailoredCE)) {\r
587                     if (isTailored(t.m_contractions_, tailoredCE, conts,\r
588                             offset + 1) == true) {\r
589                         needToAdd = false;\r
590                     }\r
591                 }\r
592                 if (!needToAdd && isPrefix(tailoredCE)\r
593                         && conts[offset + 1] == 0) {\r
594                     // pre-context character in UCA\r
595                     // The format for pre-context character is\r
596                     // conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP\r
597                     Elements elm = new Elements();\r
598                     elm.m_cPoints_ = m_utilElement_.m_uchars_;\r
599                     elm.m_CELength_ = 0;\r
600                     elm.m_uchars_ = UCharacter.toString(conts[offset]);\r
601                     elm.m_prefixChars_ = UCharacter.toString(conts[offset + 2]);\r
602                     elm.m_prefix_ = 0; // TODO(claireho) : confirm!\r
603                     prefixElm = t.m_prefixLookup_.get(elm);\r
604                     if ((prefixElm == null)\r
605                             || (prefixElm.m_prefixChars_.charAt(0) != conts[offset + 2])) {\r
606                         needToAdd = true;\r
607                     }\r
608                 }\r
609                 if (m_parser_.m_removeSet_ != null\r
610                         && m_parser_.m_removeSet_.contains(conts[offset])) {\r
611                     needToAdd = false;\r
612                 }\r
613 \r
614                 if (needToAdd == true) {\r
615                     // we need to add if this contraction is not tailored.\r
616                     if (conts[offset + 1] != 0) { // not precontext\r
617                         m_utilElement_.m_prefix_ = 0;\r
618                         m_utilElement_.m_prefixChars_ = null;\r
619                         m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;\r
620                         str.delete(0, str.length());\r
621                         str.append(conts[offset]);\r
622                         str.append(conts[offset + 1]);\r
623                         if (conts[offset + 2] != 0) {\r
624                             str.append(conts[offset + 2]);\r
625                         }\r
626                         m_utilElement_.m_uchars_ = str.toString();\r
627                         m_utilElement_.m_CELength_ = 0;\r
628                         m_utilColEIter_.setText(m_utilElement_.m_uchars_);\r
629                     } else { // add a pre-context element\r
630                         int preKeyLen = 0;\r
631                         str.delete(0, str.length()); // clean up\r
632                         m_utilElement_.m_cPoints_ = UCharacter\r
633                                 .toString(conts[offset]);\r
634                         m_utilElement_.m_CELength_ = 0;\r
635                         m_utilElement_.m_uchars_ = UCharacter\r
636                                 .toString(conts[offset]);\r
637                         m_utilElement_.m_prefixChars_ = UCharacter\r
638                                 .toString(conts[offset + 2]);\r
639                         if (prefixElm == null) {\r
640                             m_utilElement_.m_prefix_ = 0;\r
641                         } else { // TODO (claireho): confirm!\r
642                             m_utilElement_.m_prefix_ = m_utilElement_.m_prefix_;\r
643                             // m_utilElement_.m_prefix_= prefixElm.m_prefix_;\r
644                         }\r
645                         m_utilColEIter_.setText(m_utilElement_.m_prefixChars_);\r
646                         while (m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {\r
647                             // count number of keys for pre-context char.\r
648                             preKeyLen++;\r
649                         }\r
650                         str.append(conts[offset + 2]);\r
651                         str.append(conts[offset]);\r
652                         m_utilColEIter_.setText(str.toString());\r
653                         // Skip the keys for prefix character, then copy the\r
654                         // rest to el.\r
655                         while ((preKeyLen-- > 0)\r
656                                 && m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {\r
657                             continue;\r
658                         }\r
659 \r
660                     }\r
661                     while (true) {\r
662                         int CE = m_utilColEIter_.next();\r
663                         if (CE != CollationElementIterator.NULLORDER) {\r
664                             m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;\r
665                         } else {\r
666                             break;\r
667                         }\r
668                     }\r
669                     addAnElement(t, m_utilElement_);\r
670                 }\r
671             } else if (m_parser_.m_removeSet_ != null\r
672                     && m_parser_.m_removeSet_.contains(conts[offset])) {\r
673                 copyRangeFromUCA(t, conts[offset], conts[offset]);\r
674             }\r
675 \r
676             offset += 3;\r
677         }\r
678 \r
679         // Add completely ignorable elements\r
680         processUCACompleteIgnorables(t);\r
681 \r
682         // canonical closure\r
683         canonicalClosure(t);\r
684 \r
685         // still need to produce compatibility closure\r
686         assembleTable(t, collator);\r
687     }\r
688 \r
689     // private inner classes -------------------------------------------------\r
690 \r
691     @SuppressWarnings("unused")\r
692     private static class CEGenerator {\r
693         // package private data members --------------------------------------\r
694 \r
695         WeightRange m_ranges_[];\r
696         int m_rangesLength_;\r
697         int m_byteSize_;\r
698         int m_start_;\r
699         int m_limit_;\r
700         int m_maxCount_;\r
701         int m_count_;\r
702         int m_current_;\r
703         int m_fLow_; // forbidden Low\r
704         int m_fHigh_; // forbidden High\r
705 \r
706         // package private constructor ---------------------------------------\r
707 \r
708         CEGenerator() {\r
709             m_ranges_ = new WeightRange[7];\r
710             for (int i = 6; i >= 0; i--) {\r
711                 m_ranges_[i] = new WeightRange();\r
712             }\r
713         }\r
714     }\r
715 \r
716     private static class WeightRange implements Comparable<WeightRange> {\r
717         // public methods ----------------------------------------------------\r
718 \r
719         /**\r
720          * Compares this object with target\r
721          * \r
722          * @param target object to compare with\r
723          * @return 0 if equals, 1 if this is > target, -1 otherwise\r
724          */\r
725         public int compareTo(WeightRange target) {\r
726             if (this == target) {\r
727                 return 0;\r
728             }\r
729             int tstart = target.m_start_;\r
730             if (m_start_ == tstart) {\r
731                 return 0;\r
732             }\r
733             if (m_start_ > tstart) {\r
734                 return 1;\r
735             }\r
736             return -1;\r
737         }\r
738 \r
739         /**\r
740          * Initialize\r
741          */\r
742         public void clear() {\r
743             m_start_ = 0;\r
744             m_end_ = 0;\r
745             m_length_ = 0;\r
746             m_count_ = 0;\r
747             m_length2_ = 0;\r
748             m_count2_ = 0;\r
749         }\r
750 \r
751         // package private data members --------------------------------------\r
752 \r
753         int m_start_;\r
754         int m_end_;\r
755         int m_length_;\r
756         int m_count_;\r
757         int m_length2_;\r
758         int m_count2_;\r
759 \r
760         // package private constructor ---------------------------------------\r
761 \r
762         WeightRange() {\r
763             clear();\r
764         }\r
765 \r
766         /**\r
767          * Copy constructor. Cloneable is troublesome, needs to check for\r
768          * exception\r
769          * \r
770          * @param source\r
771          *            to clone\r
772          */\r
773         WeightRange(WeightRange source) {\r
774             m_start_ = source.m_start_;\r
775             m_end_ = source.m_end_;\r
776             m_length_ = source.m_length_;\r
777             m_count_ = source.m_count_;\r
778             m_length2_ = source.m_length2_;\r
779             m_count2_ = source.m_count2_;\r
780         }\r
781     }\r
782 \r
783     private static class MaxJamoExpansionTable {\r
784         // package private data members --------------------------------------\r
785 \r
786         Vector<Integer> m_endExpansionCE_;\r
787         // vector of booleans\r
788         Vector<Boolean> m_isV_;\r
789         byte m_maxLSize_;\r
790         byte m_maxVSize_;\r
791         byte m_maxTSize_;\r
792 \r
793         // package private constructor ---------------------------------------\r
794 \r
795         MaxJamoExpansionTable() {\r
796             m_endExpansionCE_ = new Vector<Integer>();\r
797             m_isV_ = new Vector<Boolean>();\r
798             m_endExpansionCE_.add(new Integer(0));\r
799             m_isV_.add(Boolean.FALSE);\r
800             m_maxLSize_ = 1;\r
801             m_maxVSize_ = 1;\r
802             m_maxTSize_ = 1;\r
803         }\r
804 \r
805         MaxJamoExpansionTable(MaxJamoExpansionTable table) {\r
806             m_endExpansionCE_ = new Vector<Integer>(table.m_endExpansionCE_);\r
807             m_isV_ = new Vector<Boolean>(table.m_isV_);\r
808             m_maxLSize_ = table.m_maxLSize_;\r
809             m_maxVSize_ = table.m_maxVSize_;\r
810             m_maxTSize_ = table.m_maxTSize_;\r
811         }\r
812     }\r
813 \r
814     private static class MaxExpansionTable {\r
815         // package private constructor --------------------------------------\r
816 \r
817         MaxExpansionTable() {\r
818             m_endExpansionCE_ = new Vector<Integer>();\r
819             m_expansionCESize_ = new Vector<Byte>();\r
820             m_endExpansionCE_.add(new Integer(0));\r
821             m_expansionCESize_.add(new Byte((byte) 0));\r
822         }\r
823 \r
824         MaxExpansionTable(MaxExpansionTable table) {\r
825             m_endExpansionCE_ = new Vector<Integer>(table.m_endExpansionCE_);\r
826             m_expansionCESize_ = new Vector<Byte>(table.m_expansionCESize_);\r
827         }\r
828 \r
829         // package private data member --------------------------------------\r
830 \r
831         Vector<Integer> m_endExpansionCE_;\r
832         Vector<Byte> m_expansionCESize_;\r
833     }\r
834 \r
835     private static class BasicContractionTable {\r
836         // package private constructors -------------------------------------\r
837 \r
838         BasicContractionTable() {\r
839             m_CEs_ = new Vector<Integer>();\r
840             m_codePoints_ = new StringBuilder();\r
841         }\r
842 \r
843         // package private data members -------------------------------------\r
844 \r
845         StringBuilder m_codePoints_;\r
846         Vector<Integer> m_CEs_;\r
847     }\r
848 \r
849     private static class ContractionTable {\r
850         // package private constructor --------------------------------------\r
851 \r
852         /**\r
853          * Builds a contraction table\r
854          * \r
855          * @param mapping\r
856          */\r
857         ContractionTable(IntTrieBuilder mapping) {\r
858             m_mapping_ = mapping;\r
859             m_elements_ = new Vector<BasicContractionTable>();\r
860             m_CEs_ = new Vector<Integer>();\r
861             m_codePoints_ = new StringBuilder();\r
862             m_offsets_ = new Vector<Integer>();\r
863             m_currentTag_ = CE_NOT_FOUND_TAG_;\r
864         }\r
865 \r
866         /**\r
867          * Copies a contraction table. Not all data will be copied into their\r
868          * own object.\r
869          * \r
870          * @param table\r
871          */\r
872         ContractionTable(ContractionTable table) {\r
873             m_mapping_ = table.m_mapping_;\r
874             m_elements_ = new Vector<BasicContractionTable>(table.m_elements_);\r
875             m_codePoints_ = new StringBuilder(table.m_codePoints_);\r
876             m_CEs_ = new Vector<Integer>(table.m_CEs_);\r
877             m_offsets_ = new Vector<Integer>(table.m_offsets_);\r
878             m_currentTag_ = table.m_currentTag_;\r
879         }\r
880 \r
881         // package private data members ------------------------------------\r
882 \r
883         /**\r
884          * Vector of BasicContractionTable\r
885          */\r
886         Vector<BasicContractionTable> m_elements_;\r
887         IntTrieBuilder m_mapping_;\r
888         StringBuilder m_codePoints_;\r
889         Vector<Integer> m_CEs_;\r
890         Vector<Integer> m_offsets_;\r
891         int m_currentTag_;\r
892     }\r
893 \r
894     /**\r
895      * Private class for combining mark table. The table is indexed by the class\r
896      * value(0-255).\r
897      */\r
898     @SuppressWarnings("unused")\r
899     private static class CombinClassTable {\r
900         /**\r
901          * accumulated numbers of combining marks.\r
902          */\r
903         int[] index = new int[256];\r
904 \r
905         /**\r
906          * code point array for combining marks.\r
907          */\r
908         char[] cPoints;\r
909 \r
910         /**\r
911          * size of cPoints.\r
912          */\r
913         int size;\r
914 \r
915         // constructor\r
916         CombinClassTable() {\r
917             cPoints = null;\r
918             size = 0;\r
919             pos = 0;\r
920             curClass = 1;\r
921         }\r
922 \r
923         /**\r
924          * Copy the combining mark table from ccc and index in compact way.\r
925          * \r
926          * @param cps\r
927          *            : code point array\r
928          * @param size\r
929          *            : size of ccc\r
930          * @param index\r
931          *            : index of combining classes(0-255)\r
932          */\r
933         void generate(char[] cps, int numOfCM, int[] ccIndex) {\r
934             int count = 0;\r
935 \r
936             cPoints = new char[numOfCM];\r
937             for (int i = 0; i < 256; i++) {\r
938                 for (int j = 0; j < ccIndex[i]; j++) {\r
939                     cPoints[count++] = cps[(i << 8) + j];\r
940                 }\r
941                 index[i] = count;\r
942             }\r
943             size = count;\r
944         }\r
945 \r
946         /**\r
947          * Get first CM(combining mark) with the combining class value cClass.\r
948          * \r
949          * @param cClass\r
950          *            : combining class value.\r
951          * @return combining mark codepoint or 0 if no combining make with class\r
952          *         value cClass\r
953          */\r
954         char GetFirstCM(int cClass) {\r
955             curClass = cClass;\r
956             if (cPoints == null || cClass == 0\r
957                     || index[cClass] == index[cClass - 1]) {\r
958                 return 0;\r
959             }\r
960             pos = 1;\r
961             return cPoints[index[cClass - 1]];\r
962         }\r
963 \r
964         /**\r
965          * Get next CM(combining mark) with the combining class value cClass.\r
966          * Return combining mark codepoint or 0 if no next CM.\r
967          */\r
968         char GetNextCM() {\r
969             if (cPoints == null\r
970                     || index[curClass] == (index[curClass - 1] + pos)) {\r
971                 return 0;\r
972             }\r
973             return cPoints[index[curClass - 1] + (pos++)];\r
974         }\r
975 \r
976         // private data members\r
977         int pos;\r
978         int curClass;\r
979     }\r
980 \r
981     private static final class BuildTable implements TrieBuilder.DataManipulate {\r
982         // package private methods ------------------------------------------\r
983 \r
984         /**\r
985          * For construction of the Trie tables. Has to be labeled public\r
986          * \r
987          * @param cp The value of the code point.\r
988          * @param offset The value of the offset.\r
989          * @return data offset or 0\r
990          */\r
991         public int getFoldedValue(int cp, int offset) {\r
992             int limit = cp + 0x400;\r
993             while (cp < limit) {\r
994                 int value = m_mapping_.getValue(cp);\r
995                 boolean inBlockZero = m_mapping_.isInZeroBlock(cp);\r
996                 int tag = getCETag(value);\r
997                 if (inBlockZero == true) {\r
998                     cp += TrieBuilder.DATA_BLOCK_LENGTH;\r
999                 } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {\r
1000                     // These are values that are starting in either UCA\r
1001                     // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).\r
1002                     // Presence of these tags means that there is nothing in\r
1003                     // this position and that it should be skipped.\r
1004                     return RuleBasedCollator.CE_SPECIAL_FLAG_\r
1005                             | (CE_SURROGATE_TAG_ << 24) | offset;\r
1006                 } else {\r
1007                     ++cp;\r
1008                 }\r
1009             }\r
1010             return 0;\r
1011         }\r
1012 \r
1013         // package private constructor --------------------------------------\r
1014 \r
1015         /**\r
1016          * Returns a table\r
1017          */\r
1018         BuildTable(CollationRuleParser parser) {\r
1019             m_collator_ = new RuleBasedCollator();\r
1020             m_collator_.setWithUCAData();\r
1021             MaxExpansionTable maxet = new MaxExpansionTable();\r
1022             MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();\r
1023             m_options_ = parser.m_options_;\r
1024             m_expansions_ = new Vector<Integer>();\r
1025             // Do your own mallocs for the structure, array and have linear\r
1026             // Latin 1\r
1027             int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_\r
1028                     | (CE_NOT_FOUND_TAG_ << 24);\r
1029             // temporary fix for jb3822, 0x100000 -> 30000\r
1030             m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue,\r
1031                     trieinitialvalue, true);\r
1032             m_prefixLookup_ = new Hashtable<Elements, Elements>();\r
1033             // uhash_open(prefixLookupHash, prefixLookupComp);\r
1034             m_contractions_ = new ContractionTable(m_mapping_);\r
1035             // copy UCA's maxexpansion and merge as we go along\r
1036             m_maxExpansions_ = maxet;\r
1037             // adding an extra initial value for easier manipulation\r
1038             for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {\r
1039                 maxet.m_endExpansionCE_.add(new Integer(\r
1040                         RuleBasedCollator.UCA_.m_expansionEndCE_[i]));\r
1041                 maxet.m_expansionCESize_.add(new Byte(\r
1042                         RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));\r
1043             }\r
1044             m_maxJamoExpansions_ = maxjet;\r
1045 \r
1046             m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];\r
1047             m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];\r
1048             Arrays.fill(m_unsafeCP_, (byte) 0);\r
1049             Arrays.fill(m_contrEndCP_, (byte) 0);\r
1050         }\r
1051 \r
1052         /**\r
1053          * Duplicating a BuildTable. Not all data will be duplicated into their\r
1054          * own object.\r
1055          * \r
1056          * @param table\r
1057          *            to clone\r
1058          */\r
1059         BuildTable(BuildTable table) {\r
1060             m_collator_ = table.m_collator_;\r
1061             m_mapping_ = new IntTrieBuilder(table.m_mapping_);\r
1062             m_expansions_ = new Vector<Integer>(table.m_expansions_);\r
1063             m_contractions_ = new ContractionTable(table.m_contractions_);\r
1064             m_contractions_.m_mapping_ = m_mapping_;\r
1065             m_options_ = table.m_options_;\r
1066             m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_);\r
1067             m_maxJamoExpansions_ = new MaxJamoExpansionTable(\r
1068                     table.m_maxJamoExpansions_);\r
1069             m_unsafeCP_ = new byte[table.m_unsafeCP_.length];\r
1070             System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,\r
1071                     m_unsafeCP_.length);\r
1072             m_contrEndCP_ = new byte[table.m_contrEndCP_.length];\r
1073             System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,\r
1074                     m_contrEndCP_.length);\r
1075         }\r
1076 \r
1077         // package private data members -------------------------------------\r
1078 \r
1079         RuleBasedCollator m_collator_;\r
1080         IntTrieBuilder m_mapping_;\r
1081         Vector<Integer> m_expansions_;\r
1082         ContractionTable m_contractions_;\r
1083         // UCATableHeader image;\r
1084         CollationRuleParser.OptionSet m_options_;\r
1085         MaxExpansionTable m_maxExpansions_;\r
1086         MaxJamoExpansionTable m_maxJamoExpansions_;\r
1087         byte m_unsafeCP_[];\r
1088         byte m_contrEndCP_[];\r
1089         Hashtable<Elements, Elements> m_prefixLookup_;\r
1090         CombinClassTable cmLookup = null;\r
1091     }\r
1092 \r
1093     private static class Elements {\r
1094         // package private data members -------------------------------------\r
1095 \r
1096         String m_prefixChars_;\r
1097         int m_prefix_;\r
1098         String m_uchars_;\r
1099         /**\r
1100          * Working string\r
1101          */\r
1102         String m_cPoints_;\r
1103         /**\r
1104          * Offset to the working string\r
1105          */\r
1106         int m_cPointsOffset_;\r
1107         /**\r
1108          * These are collation elements - there could be more than one - in case\r
1109          * of expansion\r
1110          */\r
1111         int m_CEs_[];\r
1112         int m_CELength_;\r
1113         /**\r
1114          * This is the value element maps in original table\r
1115          */\r
1116         int m_mapCE_;\r
1117         int m_sizePrim_[];\r
1118         int m_sizeSec_[];\r
1119         int m_sizeTer_[];\r
1120         boolean m_variableTop_;\r
1121         boolean m_caseBit_;\r
1122 \r
1123         // package private constructors -------------------------------------\r
1124 \r
1125         /**\r
1126          * Package private constructor\r
1127          */\r
1128         Elements() {\r
1129             m_sizePrim_ = new int[128];\r
1130             m_sizeSec_ = new int[128];\r
1131             m_sizeTer_ = new int[128];\r
1132             m_CEs_ = new int[256];\r
1133             m_CELength_ = 0;\r
1134         }\r
1135 \r
1136         /**\r
1137          * Package private constructor\r
1138          */\r
1139         Elements(Elements element) {\r
1140             m_prefixChars_ = element.m_prefixChars_;\r
1141             m_prefix_ = element.m_prefix_;\r
1142             m_uchars_ = element.m_uchars_;\r
1143             m_cPoints_ = element.m_cPoints_;\r
1144             m_cPointsOffset_ = element.m_cPointsOffset_;\r
1145             m_CEs_ = element.m_CEs_;\r
1146             m_CELength_ = element.m_CELength_;\r
1147             m_mapCE_ = element.m_mapCE_;\r
1148             m_sizePrim_ = element.m_sizePrim_;\r
1149             m_sizeSec_ = element.m_sizeSec_;\r
1150             m_sizeTer_ = element.m_sizeTer_;\r
1151             m_variableTop_ = element.m_variableTop_;\r
1152             m_caseBit_ = element.m_caseBit_;\r
1153         }\r
1154 \r
1155         // package private methods -------------------------------------------\r
1156 \r
1157         /**\r
1158          * Initializing the elements\r
1159          */\r
1160         public void clear() {\r
1161             m_prefixChars_ = null;\r
1162             m_prefix_ = 0;\r
1163             m_uchars_ = null;\r
1164             m_cPoints_ = null;\r
1165             m_cPointsOffset_ = 0;\r
1166             m_CELength_ = 0;\r
1167             m_mapCE_ = 0;\r
1168             Arrays.fill(m_sizePrim_, 0);\r
1169             Arrays.fill(m_sizeSec_, 0);\r
1170             Arrays.fill(m_sizeTer_, 0);\r
1171             m_variableTop_ = false;\r
1172             m_caseBit_ = false;\r
1173         }\r
1174 \r
1175         /**\r
1176          * Hashcode calculation for token\r
1177          * \r
1178          * @return the hashcode\r
1179          */\r
1180         public int hashCode() {\r
1181             String str = m_cPoints_.substring(m_cPointsOffset_);\r
1182             return str.hashCode();\r
1183         }\r
1184 \r
1185         /**\r
1186          * Equals calculation\r
1187          * \r
1188          * @param target Object to compare\r
1189          * @return true if target is the same as this object\r
1190          */\r
1191         public boolean equals(Object target) {\r
1192             if (target == this) {\r
1193                 return true;\r
1194             }\r
1195             if (target instanceof Elements) {\r
1196                 Elements t = (Elements) target;\r
1197                 int size = m_cPoints_.length() - m_cPointsOffset_;\r
1198                 if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {\r
1199                     return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,\r
1200                             m_cPoints_, m_cPointsOffset_, size);\r
1201                 }\r
1202             }\r
1203             return false;\r
1204         }\r
1205     }\r
1206 \r
1207     // private data member ---------------------------------------------------\r
1208 \r
1209     /**\r
1210      * Maximum strength used in CE building\r
1211      */\r
1212     private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;\r
1213     /**\r
1214      * Maximum collation strength\r
1215      */\r
1216     private static final int CE_STRENGTH_LIMIT_ = 16;\r
1217     /**\r
1218      * Strength mask array, used in inverse UCA\r
1219      */\r
1220     private static final int STRENGTH_MASK_[] = { 0xFFFF0000, 0xFFFFFF00,\r
1221         0xFFFFFFFF };\r
1222     /**\r
1223      * CE tag for not found\r
1224      */\r
1225     private static final int CE_NOT_FOUND_ = 0xF0000000;\r
1226     /**\r
1227      * CE tag for not found\r
1228      */\r
1229     private static final int CE_NOT_FOUND_TAG_ = 0;\r
1230     /**\r
1231      * This code point results in an expansion\r
1232      */\r
1233     private static final int CE_EXPANSION_TAG_ = 1;\r
1234     /**\r
1235      * Start of a contraction\r
1236      */\r
1237     private static final int CE_CONTRACTION_TAG_ = 2;\r
1238     /*\r
1239      * Thai character - do the reordering\r
1240      */\r
1241     // private static final int CE_THAI_TAG_ = 3;\r
1242     /*\r
1243      * Charset processing, not yet implemented\r
1244      */\r
1245     // private static final int CE_CHARSET_TAG_ = 4;\r
1246     /**\r
1247      * Lead surrogate that is tailored and doesn't start a contraction\r
1248      */\r
1249     private static final int CE_SURROGATE_TAG_ = 5;\r
1250     /*\r
1251      * AC00-D7AF\r
1252      */\r
1253     // private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;\r
1254     /*\r
1255      * D800-DBFF\r
1256      */\r
1257     // private static final int CE_LEAD_SURROGATE_TAG_ = 7;\r
1258     /*\r
1259      * DC00-DFFF\r
1260      */\r
1261     // private static final int CE_TRAIL_SURROGATE_TAG_ = 8;\r
1262     /*\r
1263      * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D\r
1264      */\r
1265     // private static final int CE_CJK_IMPLICIT_TAG_ = 9;\r
1266     private static final int CE_IMPLICIT_TAG_ = 10;\r
1267     private static final int CE_SPEC_PROC_TAG_ = 11;\r
1268     /**\r
1269      * This is a three byte primary with starting secondaries and tertiaries. It\r
1270      * fits in a single 32 bit CE and is used instead of expansion to save space\r
1271      * without affecting the performance (hopefully)\r
1272      */\r
1273     private static final int CE_LONG_PRIMARY_TAG_ = 12;\r
1274     /**\r
1275      * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each\r
1276      * latin 1 char + some power of two for hashing the rest of the chars. Size\r
1277      * in bytes\r
1278      */\r
1279     private static final int UNSAFECP_TABLE_SIZE_ = 1056;\r
1280     /**\r
1281      * Mask value down to "some power of two" -1. Number of bits, not num of\r
1282      * bytes.\r
1283      */\r
1284     private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;\r
1285     /**\r
1286      * Case values\r
1287      */\r
1288     private static final int UPPER_CASE_ = 0x80;\r
1289     private static final int MIXED_CASE_ = 0x40;\r
1290     private static final int LOWER_CASE_ = 0x00;\r
1291     /*\r
1292      * Initial table size\r
1293      */\r
1294     // private static final int INIT_TABLE_SIZE_ = 1028;\r
1295     /*\r
1296      * Header size, copied from ICU4C, to be changed when that value changes\r
1297      */\r
1298     // private static final int HEADER_SIZE_ = 0xC4;\r
1299     /**\r
1300      * Contraction table new element indicator\r
1301      */\r
1302     private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;\r
1303     /**\r
1304      * Parser for the rules\r
1305      */\r
1306     private CollationRuleParser m_parser_;\r
1307     /**\r
1308      * Utility UCA collation element iterator\r
1309      */\r
1310     private CollationElementIterator m_utilColEIter_;\r
1311     /**\r
1312      * Utility data members\r
1313      */\r
1314     private CEGenerator m_utilGens_[] = { new CEGenerator(), new CEGenerator(),\r
1315         new CEGenerator() };\r
1316     private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];\r
1317     private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];\r
1318     private Elements m_utilElement_ = new Elements();\r
1319     private Elements m_utilElement2_ = new Elements();\r
1320     private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();\r
1321     private int m_utilCountBuffer_[] = new int[6];\r
1322     private long m_utilLongBuffer_[] = new long[5];\r
1323     private WeightRange m_utilLowerWeightRange_[] = { new WeightRange(),\r
1324         new WeightRange(), new WeightRange(), new WeightRange(),\r
1325         new WeightRange() };\r
1326     private WeightRange m_utilUpperWeightRange_[] = { new WeightRange(),\r
1327         new WeightRange(), new WeightRange(), new WeightRange(),\r
1328         new WeightRange() };\r
1329     private WeightRange m_utilWeightRange_ = new WeightRange();\r
1330     private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;\r
1331     private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");\r
1332     private StringBuilder m_utilStringBuffer_ = new StringBuilder("");\r
1333     // Flag indicating a combining marks table is required or not.\r
1334     private static boolean buildCMTabFlag = false;\r
1335 \r
1336     // private methods -------------------------------------------------------\r
1337 \r
1338     /**\r
1339      * @param listheader\r
1340      *            parsed rule tokens\r
1341      * @exception Exception\r
1342      *                thrown when internal error occurs\r
1343      */\r
1344     private void initBuffers(CollationRuleParser.TokenListHeader listheader)\r
1345             throws Exception {\r
1346         CollationRuleParser.Token token = listheader.m_last_;\r
1347         Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);\r
1348 \r
1349         token.m_toInsert_ = 1;\r
1350         m_utilIntBuffer_[token.m_strength_] = 1;\r
1351         while (token.m_previous_ != null) {\r
1352             if (token.m_previous_.m_strength_ < token.m_strength_) {\r
1353                 // going up\r
1354                 m_utilIntBuffer_[token.m_strength_] = 0;\r
1355                 m_utilIntBuffer_[token.m_previous_.m_strength_]++;\r
1356             } else if (token.m_previous_.m_strength_ > token.m_strength_) {\r
1357                 // going down\r
1358                 m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;\r
1359             } else {\r
1360                 m_utilIntBuffer_[token.m_strength_]++;\r
1361             }\r
1362             token = token.m_previous_;\r
1363             token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];\r
1364         }\r
1365 \r
1366         token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];\r
1367         INVERSE_UCA_.getInverseGapPositions(listheader);\r
1368 \r
1369         token = listheader.m_first_;\r
1370         int fstrength = Collator.IDENTICAL;\r
1371         int initstrength = Collator.IDENTICAL;\r
1372 \r
1373         m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,\r
1374                 listheader.m_baseContCE_, Collator.PRIMARY);\r
1375         m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,\r
1376                 listheader.m_baseContCE_, Collator.SECONDARY);\r
1377         m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,\r
1378                 listheader.m_baseContCE_, Collator.TERTIARY);\r
1379         while (token != null) {\r
1380             fstrength = token.m_strength_;\r
1381             if (fstrength < initstrength) {\r
1382                 initstrength = fstrength;\r
1383                 if (listheader.m_pos_[fstrength] == -1) {\r
1384                     while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) {\r
1385                         fstrength--;\r
1386                     }\r
1387                     if (listheader.m_pos_[fstrength] == -1) {\r
1388                         throw new Exception("Internal program error");\r
1389                     }\r
1390                 }\r
1391                 if (initstrength == Collator.TERTIARY) {\r
1392                     // starting with tertiary\r
1393                     m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];\r
1394                     m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];\r
1395                     m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(\r
1396                             m_utilGens_[Collator.TERTIARY],\r
1397                             listheader.m_gapsLo_, listheader.m_gapsHi_, token,\r
1398                             fstrength);\r
1399                 } else if (initstrength == Collator.SECONDARY) {\r
1400                     // secondaries\r
1401                     m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];\r
1402                     m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(\r
1403                             m_utilGens_[Collator.SECONDARY],\r
1404                             listheader.m_gapsLo_, listheader.m_gapsHi_, token,\r
1405                             fstrength);\r
1406                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(\r
1407                             m_utilGens_[Collator.TERTIARY], token,\r
1408                             Collator.TERTIARY);\r
1409                 } else {\r
1410                     // primaries\r
1411                     m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(\r
1412                             m_utilGens_[Collator.PRIMARY],\r
1413                             listheader.m_gapsLo_, listheader.m_gapsHi_, token,\r
1414                             fstrength);\r
1415                     m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(\r
1416                             m_utilGens_[Collator.SECONDARY], token,\r
1417                             Collator.SECONDARY);\r
1418                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(\r
1419                             m_utilGens_[Collator.TERTIARY], token,\r
1420                             Collator.TERTIARY);\r
1421                 }\r
1422             } else {\r
1423                 if (token.m_strength_ == Collator.TERTIARY) {\r
1424                     m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);\r
1425                 } else if (token.m_strength_ == Collator.SECONDARY) {\r
1426                     m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);\r
1427                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(\r
1428                             m_utilGens_[Collator.TERTIARY], token,\r
1429                             Collator.TERTIARY);\r
1430                 } else if (token.m_strength_ == Collator.PRIMARY) {\r
1431                     m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);\r
1432                     m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(\r
1433                             m_utilGens_[Collator.SECONDARY], token,\r
1434                             Collator.SECONDARY);\r
1435                     m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(\r
1436                             m_utilGens_[Collator.TERTIARY], token,\r
1437                             Collator.TERTIARY);\r
1438                 }\r
1439             }\r
1440             doCE(m_utilCEBuffer_, token);\r
1441             token = token.m_next_;\r
1442         }\r
1443     }\r
1444 \r
1445     /**\r
1446      * Get the next generated ce\r
1447      * \r
1448      * @param g\r
1449      *            ce generator\r
1450      * @return next generated ce\r
1451      */\r
1452     private int getNextGenerated(CEGenerator g) {\r
1453         g.m_current_ = nextWeight(g);\r
1454         return g.m_current_;\r
1455     }\r
1456 \r
1457     /**\r
1458      * @param g\r
1459      *            CEGenerator\r
1460      * @param token\r
1461      *            rule token\r
1462      * @param strength\r
1463      * @return ce generator\r
1464      * @exception Exception\r
1465      *                thrown when internal error occurs\r
1466      */\r
1467     private int getSimpleCEGenerator(CEGenerator g,\r
1468             CollationRuleParser.Token token, int strength) throws Exception {\r
1469         int high, low, count = 1;\r
1470         int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;\r
1471 \r
1472         if (strength == Collator.SECONDARY) {\r
1473             low = RuleBasedCollator.COMMON_TOP_2_ << 24;\r
1474             high = 0xFFFFFFFF;\r
1475             count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;\r
1476         } else {\r
1477             low = RuleBasedCollator.BYTE_COMMON_ << 24; // 0x05000000;\r
1478             high = 0x40000000;\r
1479             count = 0x40 - RuleBasedCollator.BYTE_COMMON_;\r
1480         }\r
1481 \r
1482         if (token.m_next_ != null && token.m_next_.m_strength_ == strength) {\r
1483             count = token.m_next_.m_toInsert_;\r
1484         }\r
1485 \r
1486         g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,\r
1487                 g.m_ranges_);\r
1488         g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;\r
1489 \r
1490         if (g.m_rangesLength_ == 0) {\r
1491             throw new Exception("Internal program error");\r
1492         }\r
1493         return g.m_current_;\r
1494     }\r
1495 \r
1496     /**\r
1497      * Combines 2 ce into one with respect to the argument strength\r
1498      * \r
1499      * @param ce1\r
1500      *            first ce\r
1501      * @param ce2\r
1502      *            second ce\r
1503      * @param strength\r
1504      *            strength to use\r
1505      * @return combined ce\r
1506      */\r
1507     private static int mergeCE(int ce1, int ce2, int strength) {\r
1508         int mask = RuleBasedCollator.CE_TERTIARY_MASK_;\r
1509         if (strength == Collator.SECONDARY) {\r
1510             mask = RuleBasedCollator.CE_SECONDARY_MASK_;\r
1511         } else if (strength == Collator.PRIMARY) {\r
1512             mask = RuleBasedCollator.CE_PRIMARY_MASK_;\r
1513         }\r
1514         ce1 &= mask;\r
1515         ce2 &= mask;\r
1516         switch (strength) {\r
1517         case Collator.PRIMARY:\r
1518             return ce1 | ce2 >>> 16;\r
1519         case Collator.SECONDARY:\r
1520             return ce1 << 16 | ce2 << 8;\r
1521         default:\r
1522             return ce1 << 24 | ce2 << 16;\r
1523         }\r
1524     }\r
1525 \r
1526     /**\r
1527      * @param g\r
1528      *            CEGenerator\r
1529      * @param lows\r
1530      *            low gap array\r
1531      * @param highs\r
1532      *            high gap array\r
1533      * @param token\r
1534      *            rule token\r
1535      * @param fstrength\r
1536      * @exception Exception\r
1537      *                thrown when internal error occurs\r
1538      */\r
1539     private int getCEGenerator(CEGenerator g, int lows[], int highs[],\r
1540             CollationRuleParser.Token token, int fstrength) throws Exception {\r
1541         int strength = token.m_strength_;\r
1542         int low = lows[fstrength * 3 + strength];\r
1543         int high = highs[fstrength * 3 + strength];\r
1544         int maxbyte = 0;\r
1545         if (strength == Collator.TERTIARY) {\r
1546             maxbyte = 0x3F;\r
1547         } else if (strength == Collator.PRIMARY) {\r
1548             maxbyte = 0xFE;\r
1549         } else {\r
1550             maxbyte = 0xFF;\r
1551         }\r
1552 \r
1553         int count = token.m_toInsert_;\r
1554 \r
1555         if (Utility.compareUnsigned(low, high) >= 0\r
1556                 && strength > Collator.PRIMARY) {\r
1557             int s = strength;\r
1558             while (true) {\r
1559                 s--;\r
1560                 if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {\r
1561                     if (strength == Collator.SECONDARY) {\r
1562                         if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) {\r
1563                             // Override if low range is less than\r
1564                             // UCOL_COMMON_TOP2.\r
1565                             low = RuleBasedCollator.COMMON_TOP_2_ << 24;\r
1566                         }\r
1567                         high = 0xFFFFFFFF;\r
1568                     } else {\r
1569                         if (low < RuleBasedCollator.COMMON_BOTTOM_3 << 24) {\r
1570                             // Override if low range is less than\r
1571                             // UCOL_COMMON_BOT3.\r
1572                             low = RuleBasedCollator.COMMON_BOTTOM_3 << 24;\r
1573                         }\r
1574                         high = 0x40000000;\r
1575                     }\r
1576                     break;\r
1577                 }\r
1578                 if (s < 0) {\r
1579                     throw new Exception("Internal program error");\r
1580                 }\r
1581             }\r
1582         }\r
1583         if (low == 0) {\r
1584             low = 0x01000000;\r
1585         }\r
1586         if (strength == Collator.SECONDARY) { // similar as simple\r
1587             if (Utility.compareUnsigned(low,\r
1588                     RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0\r
1589                     && Utility.compareUnsigned(low,\r
1590                             RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {\r
1591                 low = RuleBasedCollator.COMMON_TOP_2_ << 24;\r
1592             }\r
1593             if (Utility.compareUnsigned(high,\r
1594                     RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0\r
1595                     && Utility.compareUnsigned(high,\r
1596                             RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {\r
1597                 high = RuleBasedCollator.COMMON_TOP_2_ << 24;\r
1598             }\r
1599             if (Utility.compareUnsigned(low,\r
1600                     RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {\r
1601                 g.m_rangesLength_ = allocateWeights(\r
1602                         RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24, high,\r
1603                         count, maxbyte, g.m_ranges_);\r
1604                 g.m_current_ = nextWeight(g);\r
1605                 // g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;\r
1606                 return g.m_current_;\r
1607             }\r
1608         }\r
1609 \r
1610         g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,\r
1611                 g.m_ranges_);\r
1612         if (g.m_rangesLength_ == 0) {\r
1613             throw new Exception("Internal program error");\r
1614         }\r
1615         g.m_current_ = nextWeight(g);\r
1616         return g.m_current_;\r
1617     }\r
1618 \r
1619     /**\r
1620      * @param ceparts\r
1621      *            list of collation elements parts\r
1622      * @param token\r
1623      *            rule token\r
1624      * @exception Exception\r
1625      *                thrown when forming case bits for expansions fails\r
1626      */\r
1627     private void doCE(int ceparts[], CollationRuleParser.Token token)\r
1628             throws Exception {\r
1629         // this one makes the table and stuff\r
1630         // int noofbytes[] = new int[3];\r
1631         for (int i = 0; i < 3; i++) {\r
1632             // noofbytes[i] = countBytes(ceparts[i]);\r
1633             m_utilIntBuffer_[i] = countBytes(ceparts[i]);\r
1634         }\r
1635 \r
1636         // Here we have to pack CEs from parts\r
1637         int cei = 0;\r
1638         int value = 0;\r
1639 \r
1640         while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1]\r
1641                 || cei < m_utilIntBuffer_[2]) {\r
1642             if (cei > 0) {\r
1643                 value = RuleBasedCollator.CE_CONTINUATION_MARKER_;\r
1644             } else {\r
1645                 value = 0;\r
1646             }\r
1647 \r
1648             if ((cei << 1) < m_utilIntBuffer_[0]) {\r
1649                 value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;\r
1650             }\r
1651             if (cei < m_utilIntBuffer_[1]) {\r
1652                 value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;\r
1653             }\r
1654 \r
1655             if (cei < m_utilIntBuffer_[2]) {\r
1656                 value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);\r
1657             }\r
1658             token.m_CE_[cei] = value;\r
1659             cei++;\r
1660         }\r
1661         if (cei == 0) { // totally ignorable\r
1662             token.m_CELength_ = 1;\r
1663             token.m_CE_[0] = 0;\r
1664         } else { // there is at least something\r
1665             token.m_CELength_ = cei;\r
1666         }\r
1667 \r
1668         // Case bits handling for expansion\r
1669         if (token.m_CE_[0] != 0) { // case bits should be set only for\r
1670                                    // non-ignorables\r
1671             int startoftokenrule = token.m_source_ & 0xFF;\r
1672             if ((token.m_source_ >>> 24) > 1) {\r
1673                 // Do it manually\r
1674                 int length = token.m_source_ >>> 24;\r
1675                 String tokenstr = token.m_rules_.substring(startoftokenrule,\r
1676                         startoftokenrule + length);\r
1677                 token.m_CE_[0] |= getCaseBits(tokenstr);\r
1678             } else {\r
1679                 // Copy it from the UCA\r
1680                 int caseCE = getFirstCE(token.m_rules_.charAt(startoftokenrule));\r
1681                 token.m_CE_[0] |= (caseCE & 0xC0);\r
1682             }\r
1683         }\r
1684     }\r
1685 \r
1686     /**\r
1687      * Count the number of non-zero bytes used in the ce\r
1688      * \r
1689      * @param ce\r
1690      * @return number of non-zero bytes used in ce\r
1691      */\r
1692     private static final int countBytes(int ce) {\r
1693         int mask = 0xFFFFFFFF;\r
1694         int result = 0;\r
1695         while (mask != 0) {\r
1696             if ((ce & mask) != 0) {\r
1697                 result++;\r
1698             }\r
1699             mask >>>= 8;\r
1700         }\r
1701         return result;\r
1702     }\r
1703 \r
1704     /**\r
1705      * We are ready to create collation elements\r
1706      * \r
1707      * @param t\r
1708      *            build table to insert\r
1709      * @param lh\r
1710      *            rule token list header\r
1711      */\r
1712     private void createElements(BuildTable t,\r
1713             CollationRuleParser.TokenListHeader lh) {\r
1714         CollationRuleParser.Token tok = lh.m_first_;\r
1715         m_utilElement_.clear();\r
1716         while (tok != null) {\r
1717             // first, check if there are any expansions\r
1718             // if there are expansions, we need to do a little bit more\r
1719             // processing since parts of expansion can be tailored, while\r
1720             // others are not\r
1721             if (tok.m_expansion_ != 0) {\r
1722                 int len = tok.m_expansion_ >>> 24;\r
1723                 int currentSequenceLen = len;\r
1724                 int expOffset = tok.m_expansion_ & 0x00FFFFFF;\r
1725                 m_utilToken_.m_source_ = currentSequenceLen | expOffset;\r
1726                 m_utilToken_.m_rules_ = m_parser_.m_source_;\r
1727 \r
1728                 while (len > 0) {\r
1729                     currentSequenceLen = len;\r
1730                     while (currentSequenceLen > 0) {\r
1731                         m_utilToken_.m_source_ = (currentSequenceLen << 24)\r
1732                                 | expOffset;\r
1733                         CollationRuleParser.Token expt = m_parser_.m_hashTable_.get(m_utilToken_);\r
1734                         if (expt != null\r
1735                                 && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {\r
1736                             // expansion is tailored\r
1737                             int noOfCEsToCopy = expt.m_CELength_;\r
1738                             for (int j = 0; j < noOfCEsToCopy; j++) {\r
1739                                 tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];\r
1740                             }\r
1741                             tok.m_expCELength_ += noOfCEsToCopy;\r
1742                             // never try to add codepoints and CEs.\r
1743                             // For some odd reason, it won't work.\r
1744                             expOffset += currentSequenceLen; // noOfCEsToCopy;\r
1745                             len -= currentSequenceLen; // noOfCEsToCopy;\r
1746                             break;\r
1747                         } else {\r
1748                             currentSequenceLen--;\r
1749                         }\r
1750                     }\r
1751                     if (currentSequenceLen == 0) {\r
1752                         // couldn't find any tailored subsequence, will have to\r
1753                         // get one from UCA. first, get the UChars from the\r
1754                         // rules then pick CEs out until there is no more and\r
1755                         // stuff them into expansion\r
1756                         m_utilColEIter_.setText(m_parser_.m_source_.substring(\r
1757                                 expOffset, expOffset + 1));\r
1758                         while (true) {\r
1759                             int order = m_utilColEIter_.next();\r
1760                             if (order == CollationElementIterator.NULLORDER) {\r
1761                                 break;\r
1762                             }\r
1763                             tok.m_expCE_[tok.m_expCELength_++] = order;\r
1764                         }\r
1765                         expOffset++;\r
1766                         len--;\r
1767                     }\r
1768                 }\r
1769             } else {\r
1770                 tok.m_expCELength_ = 0;\r
1771             }\r
1772 \r
1773             // set the ucaelement with obtained values\r
1774             m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_;\r
1775 \r
1776             // copy CEs\r
1777             System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,\r
1778                     tok.m_CELength_);\r
1779             System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,\r
1780                     tok.m_CELength_, tok.m_expCELength_);\r
1781 \r
1782             // copy UChars\r
1783             // We kept prefix and source kind of together, as it is a kind of a\r
1784             // contraction.\r
1785             // However, now we have to slice the prefix off the main thing -\r
1786             m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;\r
1787             m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;\r
1788             if (tok.m_prefix_ != 0) {\r
1789                 // we will just copy the prefix here, and adjust accordingly in\r
1790                 // the addPrefix function in ucol_elm. The reason is that we\r
1791                 // need to add both composed AND decomposed elements to the\r
1792                 // unsafe table.\r
1793                 int size = tok.m_prefix_ >> 24;\r
1794                 int offset = tok.m_prefix_ & 0x00FFFFFF;\r
1795                 m_utilElement_.m_prefixChars_ = m_parser_.m_source_.substring(\r
1796                         offset, offset + size);\r
1797                 size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);\r
1798                 offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);\r
1799                 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(\r
1800                         offset, offset + size);\r
1801             } else {\r
1802                 m_utilElement_.m_prefixChars_ = null;\r
1803                 int offset = tok.m_source_ & 0x00FFFFFF;\r
1804                 int size = tok.m_source_ >>> 24;\r
1805                 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(\r
1806                         offset, offset + size);\r
1807             }\r
1808             m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;\r
1809 \r
1810             boolean containCombinMarks = false;\r
1811             for (int i = 0; i < m_utilElement_.m_cPoints_.length()\r
1812                     - m_utilElement_.m_cPointsOffset_; i++) {\r
1813                 if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {\r
1814                     t.m_collator_.m_isJamoSpecial_ = true;\r
1815                     break;\r
1816                 }\r
1817                 if (!buildCMTabFlag) {\r
1818                     // check combining class\r
1819                     int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(i));  // TODO: review for handling supplementary characters\r
1820                     if ((fcd & 0xff) == 0) {\r
1821                         // reset flag when current char is not combining mark.\r
1822                         containCombinMarks = false;\r
1823                     } else {\r
1824                         containCombinMarks = true;\r
1825                     }\r
1826                 }\r
1827             }\r
1828 \r
1829             if (!buildCMTabFlag && containCombinMarks) {\r
1830                 buildCMTabFlag = true;\r
1831             }\r
1832 \r
1833             /***\r
1834              * // Case bits handling m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F; //\r
1835              * Clean the case bits field if (m_utilElement_.m_cPoints_.length()\r
1836              * - m_utilElement_.m_cPointsOffset_ > 1) { // Do it manually\r
1837              * m_utilElement_.m_CEs_[0] |=\r
1838              * getCaseBits(m_utilElement_.m_cPoints_); } else { // Copy it from\r
1839              * the UCA int caseCE =\r
1840              * getFirstCE(m_utilElement_.m_cPoints_.charAt(0));\r
1841              * m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0); }\r
1842              ***/\r
1843             // and then, add it\r
1844             addAnElement(t, m_utilElement_);\r
1845             tok = tok.m_next_;\r
1846         }\r
1847     }\r
1848 \r
1849     /**\r
1850      * Testing if the string argument has case\r
1851      * \r
1852      * @param src\r
1853      *            string\r
1854      * @return the case for this char array\r
1855      * @exception Exception\r
1856      *                thrown when internal program error occurs\r
1857      */\r
1858     private final int getCaseBits(String src) throws Exception {\r
1859         int uCount = 0;\r
1860         int lCount = 0;\r
1861         src = Normalizer.decompose(src, true);\r
1862         m_utilColEIter_.setText(src);\r
1863         for (int i = 0; i < src.length(); i++) {\r
1864             m_utilColEIter_.setText(src.substring(i, i + 1));\r
1865             int order = m_utilColEIter_.next();\r
1866             if (RuleBasedCollator.isContinuation(order)) {\r
1867                 throw new Exception("Internal program error");\r
1868             }\r
1869             if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {\r
1870                 uCount++;\r
1871             } else {\r
1872                 char ch = src.charAt(i);\r
1873                 if (UCharacter.isLowerCase(ch)) {\r
1874                     lCount++;\r
1875                 } else {\r
1876                     if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {\r
1877                         lCount++;\r
1878                     }\r
1879                 }\r
1880             }\r
1881         }\r
1882 \r
1883         if (uCount != 0 && lCount != 0) {\r
1884             return MIXED_CASE_;\r
1885         } else if (uCount != 0) {\r
1886             return UPPER_CASE_;\r
1887         } else {\r
1888             return LOWER_CASE_;\r
1889         }\r
1890     }\r
1891 \r
1892     /**\r
1893      * Converts a char to the uppercase Kana\r
1894      * \r
1895      * @param ch\r
1896      *            character to convert\r
1897      * @return the converted Kana character\r
1898      */\r
1899     private static final char toLargeKana(char ch) {\r
1900         if (0x3042 < ch && ch < 0x30ef) { // Kana range\r
1901             switch (ch - 0x3000) {\r
1902             case 0x41:\r
1903             case 0x43:\r
1904             case 0x45:\r
1905             case 0x47:\r
1906             case 0x49:\r
1907             case 0x63:\r
1908             case 0x83:\r
1909             case 0x85:\r
1910             case 0x8E:\r
1911             case 0xA1:\r
1912             case 0xA3:\r
1913             case 0xA5:\r
1914             case 0xA7:\r
1915             case 0xA9:\r
1916             case 0xC3:\r
1917             case 0xE3:\r
1918             case 0xE5:\r
1919             case 0xEE:\r
1920                 ch++;\r
1921                 break;\r
1922             case 0xF5:\r
1923                 ch = 0x30AB;\r
1924                 break;\r
1925             case 0xF6:\r
1926                 ch = 0x30B1;\r
1927                 break;\r
1928             }\r
1929         }\r
1930         return ch;\r
1931     }\r
1932 \r
1933     /**\r
1934      * Converts a char to the lowercase Kana\r
1935      * \r
1936      * @param ch\r
1937      *            character to convert\r
1938      * @return the converted Kana character\r
1939      */\r
1940     private static final char toSmallKana(char ch) {\r
1941         if (0x3042 < ch && ch < 0x30ef) { // Kana range\r
1942             switch (ch - 0x3000) {\r
1943             case 0x42:\r
1944             case 0x44:\r
1945             case 0x46:\r
1946             case 0x48:\r
1947             case 0x4A:\r
1948             case 0x64:\r
1949             case 0x84:\r
1950             case 0x86:\r
1951             case 0x8F:\r
1952             case 0xA2:\r
1953             case 0xA4:\r
1954             case 0xA6:\r
1955             case 0xA8:\r
1956             case 0xAA:\r
1957             case 0xC4:\r
1958             case 0xE4:\r
1959             case 0xE6:\r
1960             case 0xEF:\r
1961                 ch--;\r
1962                 break;\r
1963             case 0xAB:\r
1964                 ch = 0x30F5;\r
1965                 break;\r
1966             case 0xB1:\r
1967                 ch = 0x30F6;\r
1968                 break;\r
1969             }\r
1970         }\r
1971         return ch;\r
1972     }\r
1973 \r
1974     /**\r
1975      * This should be connected to special Jamo handling.\r
1976      */\r
1977     private int getFirstCE(char ch) {\r
1978         m_utilColEIter_.setText(UCharacter.toString(ch));\r
1979         return m_utilColEIter_.next();\r
1980     }\r
1981 \r
1982     /**\r
1983      * This adds a read element, while testing for existence\r
1984      * \r
1985      * @param t\r
1986      *            build table\r
1987      * @param element\r
1988      * @return ce\r
1989      */\r
1990     private int addAnElement(BuildTable t, Elements element) {\r
1991         Vector<Integer> expansions = t.m_expansions_;\r
1992         element.m_mapCE_ = 0;\r
1993 \r
1994         if (element.m_CELength_ == 1) {\r
1995             element.m_mapCE_ = element.m_CEs_[0];\r
1996 \r
1997         } else {\r
1998             // unfortunately, it looks like we have to look for a long primary\r
1999             // here since in canonical closure we are going to hit some long\r
2000             // primaries from the first phase, and they will come back as\r
2001             // continuations/expansions destroying the effect of the previous\r
2002             // opitimization. A long primary is a three byte primary with\r
2003             // starting secondaries and tertiaries. It can appear in long runs\r
2004             // of only primary differences (like east Asian tailorings) also,\r
2005             // it should not be an expansion, as expansions would break with\r
2006             // this\r
2007             if (element.m_CELength_ == 2 // a two CE expansion\r
2008                     && RuleBasedCollator.isContinuation(element.m_CEs_[1])\r
2009                     && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that\r
2010                                                                                                               // has\r
2011                                                                                                               // only\r
2012                                                                                                               // primaries\r
2013                                                                                                               // in\r
2014                                                                                                               // continuation\r
2015                     && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)\r
2016                     // a common secondary\r
2017                     && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and\r
2018                                                                                       // a\r
2019                                                                                       // common\r
2020                                                                                       // tertiary\r
2021             ) {\r
2022                 element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_\r
2023                 // a long primary special\r
2024                         | (CE_LONG_PRIMARY_TAG_ << 24)\r
2025                         // first and second byte of primary\r
2026                         | ((element.m_CEs_[0] >> 8) & 0xFFFF00)\r
2027                         // third byte of primary\r
2028                         | ((element.m_CEs_[1] >> 24) & 0xFF);\r
2029             } else {\r
2030                 // omitting expansion offset in builder\r
2031                 // (HEADER_SIZE_ >> 2)\r
2032                 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_\r
2033                         | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)\r
2034                         | (addExpansion(expansions, element.m_CEs_[0]) << 4)\r
2035                         & 0xFFFFF0;\r
2036 \r
2037                 for (int i = 1; i < element.m_CELength_; i++) {\r
2038                     addExpansion(expansions, element.m_CEs_[i]);\r
2039                 }\r
2040                 if (element.m_CELength_ <= 0xF) {\r
2041                     expansion |= element.m_CELength_;\r
2042                 } else {\r
2043                     addExpansion(expansions, 0);\r
2044                 }\r
2045                 element.m_mapCE_ = expansion;\r
2046                 setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],\r
2047                         (byte) element.m_CELength_, t.m_maxExpansions_);\r
2048                 if (isJamo(element.m_cPoints_.charAt(0))) {\r
2049                     t.m_collator_.m_isJamoSpecial_ = true;\r
2050                     setMaxJamoExpansion(element.m_cPoints_.charAt(0),\r
2051                             element.m_CEs_[element.m_CELength_ - 1],\r
2052                             (byte) element.m_CELength_, t.m_maxJamoExpansions_);\r
2053                 }\r
2054             }\r
2055         }\r
2056 \r
2057         // We treat digits differently - they are "uber special" and should be\r
2058         // processed differently if numeric collation is on.\r
2059         int uniChar = 0;\r
2060         if ((element.m_uchars_.length() == 2)\r
2061                 && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {\r
2062             uniChar = UCharacterProperty.getRawSupplementary(element.m_uchars_\r
2063                     .charAt(0), element.m_uchars_.charAt(1));\r
2064         } else if (element.m_uchars_.length() == 1) {\r
2065             uniChar = element.m_uchars_.charAt(0);\r
2066         }\r
2067 \r
2068         // Here, we either have one normal CE OR mapCE is set. Therefore, we\r
2069         // stuff only one element to the expansion buffer. When we encounter a\r
2070         // digit and we don't do numeric collation, we will just pick the CE\r
2071         // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE\r
2072         // && ucol_prv_getSpecialPrevCE). If we picked a special, further\r
2073         // processing will occur. If it's a simple CE, we'll return due\r
2074         // to how the loop is constructed.\r
2075         if (uniChar != 0 && UCharacter.isDigit(uniChar)) {\r
2076             // prepare the element\r
2077             int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_\r
2078                     | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)\r
2079                     | 1;\r
2080             if (element.m_mapCE_ != 0) {\r
2081                 // if there is an expansion, we'll pick it here\r
2082                 expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);\r
2083             } else {\r
2084                 expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);\r
2085             }\r
2086             element.m_mapCE_ = expansion;\r
2087         }\r
2088 \r
2089         // here we want to add the prefix structure.\r
2090         // I will try to process it as a reverse contraction, if possible.\r
2091         // prefix buffer is already reversed.\r
2092 \r
2093         if (element.m_prefixChars_ != null\r
2094                 && element.m_prefixChars_.length() - element.m_prefix_ > 0) {\r
2095             // We keep the seen prefix starter elements in a hashtable we need\r
2096             // it to be able to distinguish between the simple codepoints and\r
2097             // prefix starters. Also, we need to use it for canonical closure.\r
2098             m_utilElement2_.m_caseBit_ = element.m_caseBit_;\r
2099             m_utilElement2_.m_CELength_ = element.m_CELength_;\r
2100             m_utilElement2_.m_CEs_ = element.m_CEs_;\r
2101             m_utilElement2_.m_mapCE_ = element.m_mapCE_;\r
2102             // m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;\r
2103             m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;\r
2104             m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;\r
2105             m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;\r
2106             m_utilElement2_.m_variableTop_ = element.m_variableTop_;\r
2107             m_utilElement2_.m_prefix_ = element.m_prefix_;\r
2108             m_utilElement2_.m_prefixChars_ = Normalizer.compose(\r
2109                     element.m_prefixChars_, false);\r
2110             m_utilElement2_.m_uchars_ = element.m_uchars_;\r
2111             m_utilElement2_.m_cPoints_ = element.m_cPoints_;\r
2112             m_utilElement2_.m_cPointsOffset_ = 0;\r
2113 \r
2114             if (t.m_prefixLookup_ != null) {\r
2115                 Elements uCE = t.m_prefixLookup_.get(element);\r
2116                 if (uCE != null) {\r
2117                     // there is already a set of code points here\r
2118                     element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);\r
2119                 } else { // no code points, so this spot is clean\r
2120                     element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);\r
2121                     uCE = new Elements(element);\r
2122                     uCE.m_cPoints_ = uCE.m_uchars_;\r
2123                     t.m_prefixLookup_.put(uCE, uCE);\r
2124                 }\r
2125                 if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_\r
2126                         .length()\r
2127                         - element.m_prefix_\r
2128                         || !m_utilElement2_.m_prefixChars_.regionMatches(0,\r
2129                                 element.m_prefixChars_, element.m_prefix_,\r
2130                                 m_utilElement2_.m_prefixChars_.length())) {\r
2131                     // do it!\r
2132                     m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,\r
2133                             m_utilElement2_);\r
2134                 }\r
2135             }\r
2136         }\r
2137 \r
2138         // We need to use the canonical iterator here\r
2139         // the way we do it is to generate the canonically equivalent strings\r
2140         // for the contraction and then add the sequences that pass FCD check\r
2141         if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1\r
2142                 && !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2\r
2143                         && UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0)) && UTF16\r
2144                         .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {\r
2145             // this is a contraction, we should check whether a composed form\r
2146             // should also be included\r
2147             m_utilCanIter_.setSource(element.m_cPoints_);\r
2148             String source = m_utilCanIter_.next();\r
2149             while (source != null && source.length() > 0) {\r
2150                 if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {\r
2151                     element.m_uchars_ = source;\r
2152                     element.m_cPoints_ = element.m_uchars_;\r
2153                     finalizeAddition(t, element);\r
2154                 }\r
2155                 source = m_utilCanIter_.next();\r
2156             }\r
2157 \r
2158             return element.m_mapCE_;\r
2159         } else {\r
2160             return finalizeAddition(t, element);\r
2161         }\r
2162     }\r
2163 \r
2164     /**\r
2165      * Adds an expansion ce to the expansion vector\r
2166      * \r
2167      * @param expansions\r
2168      *            vector to add to\r
2169      * @param value\r
2170      *            of the expansion\r
2171      * @return the current position of the new element\r
2172      */\r
2173     private static final int addExpansion(Vector<Integer> expansions, int value) {\r
2174         expansions.add(new Integer(value));\r
2175         return expansions.size() - 1;\r
2176     }\r
2177 \r
2178     /**\r
2179      * Looks for the maximum length of all expansion sequences ending with the\r
2180      * same collation element. The size required for maxexpansion and maxsize is\r
2181      * returned if the arrays are too small.\r
2182      * \r
2183      * @param endexpansion\r
2184      *            the last expansion collation element to be added\r
2185      * @param expansionsize\r
2186      *            size of the expansion\r
2187      * @param maxexpansion\r
2188      *            data structure to store the maximum expansion data.\r
2189      * @returns size of the maxexpansion and maxsize used.\r
2190      */\r
2191     private static int setMaxExpansion(int endexpansion, byte expansionsize,\r
2192             MaxExpansionTable maxexpansion) {\r
2193         int start = 0;\r
2194         int limit = maxexpansion.m_endExpansionCE_.size();\r
2195         long unsigned = (long) endexpansion;\r
2196         unsigned &= 0xFFFFFFFFl;\r
2197 \r
2198         // using binary search to determine if last expansion element is\r
2199         // already in the array\r
2200         int result = -1;\r
2201         while (start < limit - 1) {\r
2202             int mid = start + ((limit - start) >> 1);\r
2203             long unsignedce = (maxexpansion.m_endExpansionCE_\r
2204                     .get(mid)).intValue();\r
2205             unsignedce &= 0xFFFFFFFFl;\r
2206             if (unsigned <= unsignedce) {\r
2207                 limit = mid;\r
2208             } else {\r
2209                 start = mid;\r
2210             }\r
2211         }\r
2212 \r
2213         if ((maxexpansion.m_endExpansionCE_.get(start)).intValue() == endexpansion) {\r
2214             result = start;\r
2215         } else if ((maxexpansion.m_endExpansionCE_.get(limit))\r
2216                 .intValue() == endexpansion) {\r
2217             result = limit;\r
2218         }\r
2219         if (result > -1) {\r
2220             // found the ce in expansion, we'll just modify the size if it\r
2221             // is smaller\r
2222             Object currentsize = maxexpansion.m_expansionCESize_.get(result);\r
2223             if (((Byte) currentsize).byteValue() < expansionsize) {\r
2224                 maxexpansion.m_expansionCESize_.set(result, new Byte(\r
2225                         expansionsize));\r
2226             }\r
2227         } else {\r
2228             // we'll need to squeeze the value into the array. initial\r
2229             // implementation. shifting the subarray down by 1\r
2230             maxexpansion.m_endExpansionCE_.insertElementAt(new Integer(\r
2231                     endexpansion), start + 1);\r
2232             maxexpansion.m_expansionCESize_.insertElementAt(new Byte(\r
2233                     expansionsize), start + 1);\r
2234         }\r
2235         return maxexpansion.m_endExpansionCE_.size();\r
2236     }\r
2237 \r
2238     /**\r
2239      * Sets the maximum length of all jamo expansion sequences ending with the\r
2240      * same collation element. The size required for maxexpansion and maxsize is\r
2241      * returned if the arrays are too small.\r
2242      * \r
2243      * @param ch\r
2244      *            the jamo codepoint\r
2245      * @param endexpansion\r
2246      *            the last expansion collation element to be added\r
2247      * @param expansionsize\r
2248      *            size of the expansion\r
2249      * @param maxexpansion\r
2250      *            data structure to store the maximum expansion data.\r
2251      * @returns size of the maxexpansion and maxsize used.\r
2252      */\r
2253     private static int setMaxJamoExpansion(char ch, int endexpansion,\r
2254             byte expansionsize, MaxJamoExpansionTable maxexpansion) {\r
2255         boolean isV = true;\r
2256         if (ch >= 0x1100 && ch <= 0x1112) {\r
2257             // determines L for Jamo, doesn't need to store this since it is\r
2258             // never at the end of a expansion\r
2259             if (maxexpansion.m_maxLSize_ < expansionsize) {\r
2260                 maxexpansion.m_maxLSize_ = expansionsize;\r
2261             }\r
2262             return maxexpansion.m_endExpansionCE_.size();\r
2263         }\r
2264 \r
2265         if (ch >= 0x1161 && ch <= 0x1175) {\r
2266             // determines V for Jamo\r
2267             if (maxexpansion.m_maxVSize_ < expansionsize) {\r
2268                 maxexpansion.m_maxVSize_ = expansionsize;\r
2269             }\r
2270         }\r
2271 \r
2272         if (ch >= 0x11A8 && ch <= 0x11C2) {\r
2273             isV = false;\r
2274             // determines T for Jamo\r
2275             if (maxexpansion.m_maxTSize_ < expansionsize) {\r
2276                 maxexpansion.m_maxTSize_ = expansionsize;\r
2277             }\r
2278         }\r
2279 \r
2280         int pos = maxexpansion.m_endExpansionCE_.size();\r
2281         while (pos > 0) {\r
2282             pos--;\r
2283             if ((maxexpansion.m_endExpansionCE_.get(pos)).intValue() == endexpansion) {\r
2284                 return maxexpansion.m_endExpansionCE_.size();\r
2285             }\r
2286         }\r
2287         maxexpansion.m_endExpansionCE_.add(new Integer(endexpansion));\r
2288         maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE);\r
2289 \r
2290         return maxexpansion.m_endExpansionCE_.size();\r
2291     }\r
2292 \r
2293     /**\r
2294      * Adds a prefix to the table\r
2295      * \r
2296      * @param t\r
2297      *            build table to update\r
2298      * @param CE\r
2299      *            collation element to add\r
2300      * @param element\r
2301      *            rule element to add\r
2302      * @return modified ce\r
2303      */\r
2304     private int addPrefix(BuildTable t, int CE, Elements element) {\r
2305         // currently the longest prefix we're supporting in Japanese is two\r
2306         // characters long. Although this table could quite easily mimic\r
2307         // complete contraction stuff there is no good reason to make a general\r
2308         // solution, as it would require some error prone messing.\r
2309         ContractionTable contractions = t.m_contractions_;\r
2310         String oldCP = element.m_cPoints_;\r
2311         int oldCPOffset = element.m_cPointsOffset_;\r
2312 \r
2313         contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;\r
2314         // here, we will normalize & add prefix to the table.\r
2315         int size = element.m_prefixChars_.length() - element.m_prefix_;\r
2316         for (int j = 1; j < size; j++) {\r
2317             // First add NFD prefix chars to unsafe CP hash table\r
2318             // Unless it is a trail surrogate, which is handled algoritmically\r
2319             // and shouldn't take up space in the table.\r
2320             char ch = element.m_prefixChars_.charAt(j + element.m_prefix_);\r
2321             if (!UTF16.isTrailSurrogate(ch)) {\r
2322                 unsafeCPSet(t.m_unsafeCP_, ch);\r
2323             }\r
2324         }\r
2325 \r
2326         // StringBuffer reversed = new StringBuffer();\r
2327         m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());\r
2328         for (int j = 0; j < size; j++) {\r
2329             // prefixes are going to be looked up backwards\r
2330             // therefore, we will promptly reverse the prefix buffer...\r
2331             int offset = element.m_prefixChars_.length() - j - 1;\r
2332             m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset));\r
2333         }\r
2334         element.m_prefixChars_ = m_utilStringBuffer_.toString();\r
2335         element.m_prefix_ = 0;\r
2336 \r
2337         // the first codepoint is also unsafe, as it forms a 'contraction' with\r
2338         // the prefix\r
2339         if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {\r
2340             unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));\r
2341         }\r
2342 \r
2343         element.m_cPoints_ = element.m_prefixChars_;\r
2344         element.m_cPointsOffset_ = element.m_prefix_;\r
2345 \r
2346         // Add the last char of the contraction to the contraction-end hash\r
2347         // table. unless it is a trail surrogate, which is handled\r
2348         // algorithmically and shouldn't be in the table\r
2349         if (!UTF16.isTrailSurrogate(element.m_cPoints_\r
2350                 .charAt(element.m_cPoints_.length() - 1))) {\r
2351             ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_\r
2352                     .charAt(element.m_cPoints_.length() - 1));\r
2353         }\r
2354         // First we need to check if contractions starts with a surrogate\r
2355         // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);\r
2356 \r
2357         // If there are any Jamos in the contraction, we should turn on special\r
2358         // processing for Jamos\r
2359         if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {\r
2360             t.m_collator_.m_isJamoSpecial_ = true;\r
2361         }\r
2362         // then we need to deal with it\r
2363         // we could aready have something in table - or we might not\r
2364         if (!isPrefix(CE)) {\r
2365             // if it wasn't contraction, we wouldn't end up here\r
2366             int firstContractionOffset = addContraction(contractions,\r
2367                     CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);\r
2368             int newCE = processContraction(contractions, element, CE_NOT_FOUND_);\r
2369             addContraction(contractions, firstContractionOffset,\r
2370                     element.m_prefixChars_.charAt(element.m_prefix_), newCE);\r
2371             addContraction(contractions, firstContractionOffset, (char) 0xFFFF,\r
2372                     CE);\r
2373             CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset);\r
2374         } else {\r
2375             // we are adding to existing contraction\r
2376             // there were already some elements in the table, so we need to add\r
2377             // a new contraction\r
2378             // Two things can happen here: either the codepoint is already in\r
2379             // the table, or it is not\r
2380             char ch = element.m_prefixChars_.charAt(element.m_prefix_);\r
2381             int position = findCP(contractions, CE, ch);\r
2382             if (position > 0) {\r
2383                 // if it is we just continue down the chain\r
2384                 int eCE = getCE(contractions, CE, position);\r
2385                 int newCE = processContraction(contractions, element, eCE);\r
2386                 setContraction(contractions, CE, position, ch, newCE);\r
2387             } else {\r
2388                 // if it isn't, we will have to create a new sequence\r
2389                 processContraction(contractions, element, CE_NOT_FOUND_);\r
2390                 insertContraction(contractions, CE, ch, element.m_mapCE_);\r
2391             }\r
2392         }\r
2393 \r
2394         element.m_cPoints_ = oldCP;\r
2395         element.m_cPointsOffset_ = oldCPOffset;\r
2396 \r
2397         return CE;\r
2398     }\r
2399 \r
2400     /**\r
2401      * Checks if the argument ce is a contraction\r
2402      * \r
2403      * @param CE\r
2404      *            collation element\r
2405      * @return true if argument ce is a contraction\r
2406      */\r
2407     private static final boolean isContraction(int CE) {\r
2408         return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);\r
2409     }\r
2410 \r
2411     /**\r
2412      * Checks if the argument ce has a prefix\r
2413      * \r
2414      * @param CE\r
2415      *            collation element\r
2416      * @return true if argument ce has a prefix\r
2417      */\r
2418     private static final boolean isPrefix(int CE) {\r
2419         return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);\r
2420     }\r
2421 \r
2422     /**\r
2423      * Checks if the argument ce is special\r
2424      * \r
2425      * @param CE\r
2426      *            collation element\r
2427      * @return true if argument ce is special\r
2428      */\r
2429     private static final boolean isSpecial(int CE) {\r
2430         return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;\r
2431     }\r
2432 \r
2433     /**\r
2434      * Checks if the argument ce has a prefix\r
2435      * \r
2436      * @param CE\r
2437      *            collation element\r
2438      * @return true if argument ce has a prefix\r
2439      */\r
2440     private static final int getCETag(int CE) {\r
2441         return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;\r
2442     }\r
2443 \r
2444     /**\r
2445      * Gets the ce at position in contraction table\r
2446      * \r
2447      * @param table\r
2448      *            contraction table\r
2449      * @param position\r
2450      *            offset to the contraction table\r
2451      * @return ce\r
2452      */\r
2453     private static final int getCE(ContractionTable table, int element,\r
2454             int position) {\r
2455         element &= 0xFFFFFF;\r
2456         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2457 \r
2458         if (tbl == null) {\r
2459             return CE_NOT_FOUND_;\r
2460         }\r
2461         if (position > tbl.m_CEs_.size() || position == -1) {\r
2462             return CE_NOT_FOUND_;\r
2463         } else {\r
2464             return tbl.m_CEs_.get(position).intValue();\r
2465         }\r
2466     }\r
2467 \r
2468     /**\r
2469      * Sets the unsafe character\r
2470      * \r
2471      * @param table\r
2472      *            unsafe table\r
2473      * @param c\r
2474      *            character to be added\r
2475      */\r
2476     private static final void unsafeCPSet(byte table[], char c) {\r
2477         int hash = c;\r
2478         if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {\r
2479             if (hash >= 0xd800 && hash <= 0xf8ff) {\r
2480                 // Part of a surrogate, or in private use area.\r
2481                 // These don't go in the table\r
2482                 return;\r
2483             }\r
2484             hash = (hash & UNSAFECP_TABLE_MASK_) + 256;\r
2485         }\r
2486         table[hash >> 3] |= (1 << (hash & 7));\r
2487     }\r
2488 \r
2489     /**\r
2490      * Sets the contraction end character\r
2491      * \r
2492      * @param table\r
2493      *            contraction end table\r
2494      * @param c\r
2495      *            character to be added\r
2496      */\r
2497     private static final void ContrEndCPSet(byte table[], char c) {\r
2498         int hash = c;\r
2499         if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {\r
2500             hash = (hash & UNSAFECP_TABLE_MASK_) + 256;\r
2501         }\r
2502         table[hash >> 3] |= (1 << (hash & 7));\r
2503     }\r
2504 \r
2505     /**\r
2506      * Adds more contractions in table. If element is non existant, it creates\r
2507      * on. Returns element handle\r
2508      * \r
2509      * @param table\r
2510      *            contraction table\r
2511      * @param element\r
2512      *            offset to the contraction table\r
2513      * @param codePoint\r
2514      *            codepoint to add\r
2515      * @param value\r
2516      * @return collation element\r
2517      */\r
2518     private static int addContraction(ContractionTable table, int element,\r
2519             char codePoint, int value) {\r
2520         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2521         if (tbl == null) {\r
2522             tbl = addAContractionElement(table);\r
2523             element = table.m_elements_.size() - 1;\r
2524         }\r
2525 \r
2526         tbl.m_CEs_.add(new Integer(value));\r
2527         tbl.m_codePoints_.append(codePoint);\r
2528         return constructSpecialCE(table.m_currentTag_, element);\r
2529     }\r
2530 \r
2531     /**\r
2532      * Adds a contraction element to the table\r
2533      * \r
2534      * @param table\r
2535      *            contraction table to update\r
2536      * @return contraction\r
2537      */\r
2538     private static BasicContractionTable addAContractionElement(\r
2539             ContractionTable table) {\r
2540         BasicContractionTable result = new BasicContractionTable();\r
2541         table.m_elements_.add(result);\r
2542         return result;\r
2543     }\r
2544 \r
2545     /**\r
2546      * Constructs a special ce\r
2547      * \r
2548      * @param tag\r
2549      *            special tag\r
2550      * @param CE\r
2551      *            collation element\r
2552      * @return a contraction ce\r
2553      */\r
2554     private static final int constructSpecialCE(int tag, int CE) {\r
2555         return RuleBasedCollator.CE_SPECIAL_FLAG_\r
2556                 | (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF);\r
2557     }\r
2558 \r
2559     /**\r
2560      * Sets and inserts the element that has a contraction\r
2561      * \r
2562      * @param contractions\r
2563      *            contraction table\r
2564      * @param element\r
2565      *            contracting element\r
2566      * @param existingCE\r
2567      * @return contraction ce\r
2568      */\r
2569     private static int processContraction(ContractionTable contractions,\r
2570             Elements element, int existingCE) {\r
2571         int firstContractionOffset = 0;\r
2572         // end of recursion\r
2573         if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {\r
2574             if (isContractionTableElement(existingCE)\r
2575                     && getCETag(existingCE) == contractions.m_currentTag_) {\r
2576                 changeContraction(contractions, existingCE, (char) 0,\r
2577                         element.m_mapCE_);\r
2578                 changeContraction(contractions, existingCE, (char) 0xFFFF,\r
2579                         element.m_mapCE_);\r
2580                 return existingCE;\r
2581             } else {\r
2582                 // can't do just that. existingCe might be a contraction,\r
2583                 // meaning that we need to do another step\r
2584                 return element.m_mapCE_;\r
2585             }\r
2586         }\r
2587 \r
2588         // this recursion currently feeds on the only element we have...\r
2589         // We will have to copy it in order to accomodate for both backward\r
2590         // and forward cycles\r
2591         // we encountered either an empty space or a non-contraction element\r
2592         // this means we are constructing a new contraction sequence\r
2593         element.m_cPointsOffset_++;\r
2594         if (!isContractionTableElement(existingCE)) {\r
2595             // if it wasn't contraction, we wouldn't end up here\r
2596             firstContractionOffset = addContraction(contractions,\r
2597                     CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, existingCE);\r
2598             int newCE = processContraction(contractions, element, CE_NOT_FOUND_);\r
2599             addContraction(contractions, firstContractionOffset,\r
2600                     element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE);\r
2601             addContraction(contractions, firstContractionOffset, (char) 0xFFFF,\r
2602                     existingCE);\r
2603             existingCE = constructSpecialCE(contractions.m_currentTag_,\r
2604                     firstContractionOffset);\r
2605         } else {\r
2606             // we are adding to existing contraction\r
2607             // there were already some elements in the table, so we need to add\r
2608             // a new contraction\r
2609             // Two things can happen here: either the codepoint is already in\r
2610             // the table, or it is not\r
2611             int position = findCP(contractions, existingCE, element.m_cPoints_\r
2612                     .charAt(element.m_cPointsOffset_));\r
2613             if (position > 0) {\r
2614                 // if it is we just continue down the chain\r
2615                 int eCE = getCE(contractions, existingCE, position);\r
2616                 int newCE = processContraction(contractions, element, eCE);\r
2617                 setContraction(contractions, existingCE, position,\r
2618                         element.m_cPoints_.charAt(element.m_cPointsOffset_),\r
2619                         newCE);\r
2620             } else {\r
2621                 // if it isn't, we will have to create a new sequence\r
2622                 int newCE = processContraction(contractions, element,\r
2623                         CE_NOT_FOUND_);\r
2624                 insertContraction(contractions, existingCE, element.m_cPoints_\r
2625                         .charAt(element.m_cPointsOffset_), newCE);\r
2626             }\r
2627         }\r
2628         element.m_cPointsOffset_--;\r
2629         return existingCE;\r
2630     }\r
2631 \r
2632     /**\r
2633      * Checks if CE belongs to the contraction table\r
2634      * \r
2635      * @param CE\r
2636      *            collation element to test\r
2637      * @return true if CE belongs to the contraction table\r
2638      */\r
2639     private static final boolean isContractionTableElement(int CE) {\r
2640         return isSpecial(CE)\r
2641                 && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);\r
2642     }\r
2643 \r
2644     /**\r
2645      * Gets the codepoint\r
2646      * \r
2647      * @param table\r
2648      *            contraction table\r
2649      * @param element\r
2650      *            offset to the contraction element in the table\r
2651      * @param codePoint\r
2652      *            code point to look for\r
2653      * @return the offset to the code point\r
2654      */\r
2655     private static int findCP(ContractionTable table, int element,\r
2656             char codePoint) {\r
2657         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2658         if (tbl == null) {\r
2659             return -1;\r
2660         }\r
2661 \r
2662         int position = 0;\r
2663         while (codePoint > tbl.m_codePoints_.charAt(position)) {\r
2664             position++;\r
2665             if (position > tbl.m_codePoints_.length()) {\r
2666                 return -1;\r
2667             }\r
2668         }\r
2669         if (codePoint == tbl.m_codePoints_.charAt(position)) {\r
2670             return position;\r
2671         } else {\r
2672             return -1;\r
2673         }\r
2674     }\r
2675 \r
2676     /**\r
2677      * Gets the contraction element out of the contraction table\r
2678      * \r
2679      * @param table\r
2680      *            contraction table\r
2681      * @param offset\r
2682      *            to the element in the contraction table\r
2683      * @return basic contraction element at offset in the contraction table\r
2684      */\r
2685     private static final BasicContractionTable getBasicContractionTable(\r
2686             ContractionTable table, int offset) {\r
2687         offset &= 0xFFFFFF;\r
2688         if (offset == 0xFFFFFF) {\r
2689             return null;\r
2690         }\r
2691         return table.m_elements_.get(offset);\r
2692     }\r
2693 \r
2694     /**\r
2695      * Changes the contraction element\r
2696      * \r
2697      * @param table\r
2698      *            contraction table\r
2699      * @param element\r
2700      *            offset to the element in the contraction table\r
2701      * @param codePoint\r
2702      *            codepoint\r
2703      * @param newCE\r
2704      *            new collation element\r
2705      * @return basic contraction element at offset in the contraction table\r
2706      */\r
2707     private static final int changeContraction(ContractionTable table,\r
2708             int element, char codePoint, int newCE) {\r
2709         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2710         if (tbl == null) {\r
2711             return 0;\r
2712         }\r
2713         int position = 0;\r
2714         while (codePoint > tbl.m_codePoints_.charAt(position)) {\r
2715             position++;\r
2716             if (position > tbl.m_codePoints_.length()) {\r
2717                 return CE_NOT_FOUND_;\r
2718             }\r
2719         }\r
2720         if (codePoint == tbl.m_codePoints_.charAt(position)) {\r
2721             tbl.m_CEs_.set(position, new Integer(newCE));\r
2722             return element & 0xFFFFFF;\r
2723         } else {\r
2724             return CE_NOT_FOUND_;\r
2725         }\r
2726     }\r
2727 \r
2728     /**\r
2729      * Sets a part of contraction sequence in table. If element is non existant,\r
2730      * it creates on. Returns element handle.\r
2731      * \r
2732      * @param table\r
2733      *            contraction table\r
2734      * @param element\r
2735      *            offset to the contraction table\r
2736      * @param offset\r
2737      * @param codePoint\r
2738      *            contraction character\r
2739      * @param value\r
2740      *            ce value\r
2741      * @return new contraction ce\r
2742      */\r
2743     private static final int setContraction(ContractionTable table,\r
2744             int element, int offset, char codePoint, int value) {\r
2745         element &= 0xFFFFFF;\r
2746         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2747         if (tbl == null) {\r
2748             tbl = addAContractionElement(table);\r
2749             element = table.m_elements_.size() - 1;\r
2750         }\r
2751 \r
2752         tbl.m_CEs_.set(offset, new Integer(value));\r
2753         tbl.m_codePoints_.setCharAt(offset, codePoint);\r
2754         return constructSpecialCE(table.m_currentTag_, element);\r
2755     }\r
2756 \r
2757     /**\r
2758      * Inserts a part of contraction sequence in table. Sequences behind the\r
2759      * offset are moved back. If element is non existent, it creates on.\r
2760      * \r
2761      * @param table\r
2762      *            contraction\r
2763      * @param element\r
2764      *            offset to the table contraction\r
2765      * @param codePoint\r
2766      *            code point\r
2767      * @param value\r
2768      *            collation element value\r
2769      * @return contraction collation element\r
2770      */\r
2771     private static final int insertContraction(ContractionTable table,\r
2772             int element, char codePoint, int value) {\r
2773         element &= 0xFFFFFF;\r
2774         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2775         if (tbl == null) {\r
2776             tbl = addAContractionElement(table);\r
2777             element = table.m_elements_.size() - 1;\r
2778         }\r
2779 \r
2780         int offset = 0;\r
2781         while (tbl.m_codePoints_.charAt(offset) < codePoint\r
2782                 && offset < tbl.m_codePoints_.length()) {\r
2783             offset++;\r
2784         }\r
2785 \r
2786         tbl.m_CEs_.insertElementAt(new Integer(value), offset);\r
2787         tbl.m_codePoints_.insert(offset, codePoint);\r
2788 \r
2789         return constructSpecialCE(table.m_currentTag_, element);\r
2790     }\r
2791 \r
2792     /**\r
2793      * Finalize addition\r
2794      * \r
2795      * @param t\r
2796      *            build table\r
2797      * @param element\r
2798      *            to add\r
2799      */\r
2800     private final static int finalizeAddition(BuildTable t, Elements element) {\r
2801         int CE = CE_NOT_FOUND_;\r
2802         // This should add a completely ignorable element to the\r
2803         // unsafe table, so that backward iteration will skip\r
2804         // over it when treating contractions.\r
2805         if (element.m_mapCE_ == 0) {\r
2806             for (int i = 0; i < element.m_cPoints_.length(); i++) {\r
2807                 char ch = element.m_cPoints_.charAt(i);\r
2808                 if (!UTF16.isTrailSurrogate(ch)) {\r
2809                     unsafeCPSet(t.m_unsafeCP_, ch);\r
2810                 }\r
2811             }\r
2812         }\r
2813 \r
2814         if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {\r
2815             // we're adding a contraction\r
2816             int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);\r
2817             CE = t.m_mapping_.getValue(cp);\r
2818             CE = addContraction(t, CE, element);\r
2819         } else {\r
2820             // easy case\r
2821             CE = t.m_mapping_.getValue(element.m_cPoints_\r
2822                     .charAt(element.m_cPointsOffset_));\r
2823 \r
2824             if (CE != CE_NOT_FOUND_) {\r
2825                 if (isContractionTableElement(CE)) {\r
2826                     // adding a non contraction element (thai, expansion,\r
2827                     // single) to already existing contraction\r
2828                     if (!isPrefix(element.m_mapCE_)) {\r
2829                         // we cannot reenter prefix elements - as we are going\r
2830                         // to create a dead loop\r
2831                         // Only expansions and regular CEs can go here...\r
2832                         // Contractions will never happen in this place\r
2833                         setContraction(t.m_contractions_, CE, 0, (char) 0,\r
2834                                 element.m_mapCE_);\r
2835                         // This loop has to change the CE at the end of\r
2836                         // contraction REDO!\r
2837                         changeLastCE(t.m_contractions_, CE, element.m_mapCE_);\r
2838                     }\r
2839                 } else {\r
2840                     t.m_mapping_\r
2841                             .setValue(element.m_cPoints_\r
2842                                     .charAt(element.m_cPointsOffset_),\r
2843                                     element.m_mapCE_);\r
2844                     if (element.m_prefixChars_ != null\r
2845                             && element.m_prefixChars_.length() > 0\r
2846                             && getCETag(CE) != CE_IMPLICIT_TAG_) {\r
2847                         // Add CE for standalone precontext char.\r
2848                         Elements origElem = new Elements();\r
2849                         origElem.m_prefixChars_ = null;\r
2850                         origElem.m_uchars_ = element.m_cPoints_;\r
2851                         origElem.m_cPoints_ = origElem.m_uchars_;\r
2852                         origElem.m_CEs_[0] = CE;\r
2853                         origElem.m_mapCE_ = CE;\r
2854                         origElem.m_CELength_ = 1;\r
2855                         finalizeAddition(t, origElem);\r
2856                     }\r
2857                 }\r
2858             } else {\r
2859                 t.m_mapping_.setValue(element.m_cPoints_\r
2860                         .charAt(element.m_cPointsOffset_), element.m_mapCE_);\r
2861             }\r
2862         }\r
2863         return CE;\r
2864     }\r
2865 \r
2866     /**\r
2867      * Note regarding surrogate handling: We are interested only in the single\r
2868      * or leading surrogates in a contraction. If a surrogate is somewhere else\r
2869      * in the contraction, it is going to be handled as a pair of code units, as\r
2870      * it doesn't affect the performance AND handling surrogates specially would\r
2871      * complicate code way too much.\r
2872      */\r
2873     private static int addContraction(BuildTable t, int CE, Elements element) {\r
2874         ContractionTable contractions = t.m_contractions_;\r
2875         contractions.m_currentTag_ = CE_CONTRACTION_TAG_;\r
2876 \r
2877         // First we need to check if contractions starts with a surrogate\r
2878         int cp = UTF16.charAt(element.m_cPoints_, 0);\r
2879         int cpsize = 1;\r
2880         if (UCharacter.isSupplementary(cp)) {\r
2881             cpsize = 2;\r
2882         }\r
2883         if (cpsize < element.m_cPoints_.length()) {\r
2884             // This is a real contraction, if there are other characters after\r
2885             // the first\r
2886             int size = element.m_cPoints_.length() - element.m_cPointsOffset_;\r
2887             for (int j = 1; j < size; j++) {\r
2888                 // First add contraction chars to unsafe CP hash table\r
2889                 // Unless it is a trail surrogate, which is handled\r
2890                 // algoritmically and shouldn't take up space in the table.\r
2891                 if (!UTF16.isTrailSurrogate(element.m_cPoints_\r
2892                         .charAt(element.m_cPointsOffset_ + j))) {\r
2893                     unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_\r
2894                             .charAt(element.m_cPointsOffset_ + j));\r
2895                 }\r
2896             }\r
2897             // Add the last char of the contraction to the contraction-end\r
2898             // hash table. unless it is a trail surrogate, which is handled\r
2899             // algorithmically and shouldn't be in the table\r
2900             if (!UTF16.isTrailSurrogate(element.m_cPoints_\r
2901                     .charAt(element.m_cPoints_.length() - 1))) {\r
2902                 ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_\r
2903                         .charAt(element.m_cPoints_.length() - 1));\r
2904             }\r
2905 \r
2906             // If there are any Jamos in the contraction, we should turn on\r
2907             // special processing for Jamos\r
2908             if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) {\r
2909                 t.m_collator_.m_isJamoSpecial_ = true;\r
2910             }\r
2911             // then we need to deal with it\r
2912             // we could aready have something in table - or we might not\r
2913             element.m_cPointsOffset_ += cpsize;\r
2914             if (!isContraction(CE)) {\r
2915                 // if it wasn't contraction, we wouldn't end up here\r
2916                 int firstContractionOffset = addContraction(contractions,\r
2917                         CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);\r
2918                 int newCE = processContraction(contractions, element,\r
2919                         CE_NOT_FOUND_);\r
2920                 addContraction(contractions, firstContractionOffset,\r
2921                         element.m_cPoints_.charAt(element.m_cPointsOffset_),\r
2922                         newCE);\r
2923                 addContraction(contractions, firstContractionOffset,\r
2924                         (char) 0xFFFF, CE);\r
2925                 CE = constructSpecialCE(CE_CONTRACTION_TAG_,\r
2926                         firstContractionOffset);\r
2927             } else {\r
2928                 // we are adding to existing contraction\r
2929                 // there were already some elements in the table, so we need to\r
2930                 // add a new contraction\r
2931                 // Two things can happen here: either the codepoint is already\r
2932                 // in the table, or it is not\r
2933                 int position = findCP(contractions, CE, element.m_cPoints_\r
2934                         .charAt(element.m_cPointsOffset_));\r
2935                 if (position > 0) {\r
2936                     // if it is we just continue down the chain\r
2937                     int eCE = getCE(contractions, CE, position);\r
2938                     int newCE = processContraction(contractions, element, eCE);\r
2939                     setContraction(\r
2940                             contractions,\r
2941                             CE,\r
2942                             position,\r
2943                             element.m_cPoints_.charAt(element.m_cPointsOffset_),\r
2944                             newCE);\r
2945                 } else {\r
2946                     // if it isn't, we will have to create a new sequence\r
2947                     int newCE = processContraction(contractions, element,\r
2948                             CE_NOT_FOUND_);\r
2949                     insertContraction(contractions, CE, element.m_cPoints_\r
2950                             .charAt(element.m_cPointsOffset_), newCE);\r
2951                 }\r
2952             }\r
2953             element.m_cPointsOffset_ -= cpsize;\r
2954             t.m_mapping_.setValue(cp, CE);\r
2955         } else if (!isContraction(CE)) {\r
2956             // this is just a surrogate, and there is no contraction\r
2957             t.m_mapping_.setValue(cp, element.m_mapCE_);\r
2958         } else {\r
2959             // fill out the first stage of the contraction with the surrogate\r
2960             // CE\r
2961             changeContraction(contractions, CE, (char) 0, element.m_mapCE_);\r
2962             changeContraction(contractions, CE, (char) 0xFFFF, element.m_mapCE_);\r
2963         }\r
2964         return CE;\r
2965     }\r
2966 \r
2967     /**\r
2968      * this is for adding non contractions\r
2969      * \r
2970      * @param table\r
2971      *            contraction table\r
2972      * @param element\r
2973      *            offset to the contraction table\r
2974      * @param value\r
2975      *            collation element value\r
2976      * @return new collation element\r
2977      */\r
2978     private static final int changeLastCE(ContractionTable table, int element,\r
2979             int value) {\r
2980         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
2981         if (tbl == null) {\r
2982             return 0;\r
2983         }\r
2984 \r
2985         tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, new Integer(value));\r
2986         return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF);\r
2987     }\r
2988 \r
2989     /**\r
2990      * Given a set of ranges calculated by allocWeights(), iterate through the\r
2991      * weights. Sets the next weight in cegenerator.m_current_.\r
2992      * \r
2993      * @param cegenerator\r
2994      *            object that contains ranges weight range array and its\r
2995      *            rangeCount\r
2996      * @return the next weight\r
2997      */\r
2998     private static int nextWeight(CEGenerator cegenerator) {\r
2999         if (cegenerator.m_rangesLength_ > 0) {\r
3000             // get maxByte from the .count field\r
3001             int maxByte = cegenerator.m_ranges_[0].m_count_;\r
3002             // get the next weight\r
3003             int weight = cegenerator.m_ranges_[0].m_start_;\r
3004             if (weight == cegenerator.m_ranges_[0].m_end_) {\r
3005                 // this range is finished, remove it and move the following\r
3006                 // ones up\r
3007                 cegenerator.m_rangesLength_--;\r
3008                 if (cegenerator.m_rangesLength_ > 0) {\r
3009                     System.arraycopy(cegenerator.m_ranges_, 1,\r
3010                             cegenerator.m_ranges_, 0,\r
3011                             cegenerator.m_rangesLength_);\r
3012                     cegenerator.m_ranges_[0].m_count_ = maxByte;\r
3013                     // keep maxByte in ranges[0]\r
3014                 }\r
3015             } else {\r
3016                 // increment the weight for the next value\r
3017                 cegenerator.m_ranges_[0].m_start_ = incWeight(weight,\r
3018                         cegenerator.m_ranges_[0].m_length2_, maxByte);\r
3019             }\r
3020             return weight;\r
3021         }\r
3022         return -1;\r
3023     }\r
3024 \r
3025     /**\r
3026      * Increment the collation weight\r
3027      * \r
3028      * @param weight\r
3029      *            to increment\r
3030      * @param length\r
3031      * @param maxByte\r
3032      * @return new incremented weight\r
3033      */\r
3034     private static final int incWeight(int weight, int length, int maxByte) {\r
3035         while (true) {\r
3036             int b = getWeightByte(weight, length);\r
3037             if (b < maxByte) {\r
3038                 return setWeightByte(weight, length, b + 1);\r
3039             } else {\r
3040                 // roll over, set this byte to BYTE_FIRST_TAILORED_ and\r
3041                 // increment the previous one\r
3042                 weight = setWeightByte(weight, length,\r
3043                         RuleBasedCollator.BYTE_FIRST_TAILORED_);\r
3044                 --length;\r
3045             }\r
3046         }\r
3047     }\r
3048 \r
3049     /**\r
3050      * Gets the weight byte\r
3051      * \r
3052      * @param weight\r
3053      * @param index\r
3054      * @return byte\r
3055      */\r
3056     private static final int getWeightByte(int weight, int index) {\r
3057         return (weight >> ((4 - index) << 3)) & 0xff;\r
3058     }\r
3059 \r
3060     /**\r
3061      * Set the weight byte in table\r
3062      * \r
3063      * @param weight\r
3064      * @param index\r
3065      * @param b\r
3066      *            byte\r
3067      */\r
3068     private static final int setWeightByte(int weight, int index, int b) {\r
3069         index <<= 3;\r
3070         // 0xffffffff except a 00 "hole" for the index-th byte\r
3071         int mask = 0xffffffff >>> index;\r
3072         index = 32 - index;\r
3073         mask |= 0xffffff00 << index;\r
3074         return (weight & mask) | (b << index);\r
3075     }\r
3076 \r
3077     /**\r
3078      * Call getWeightRanges and then determine heuristically which ranges to use\r
3079      * for a given number of weights between (excluding) two limits\r
3080      * \r
3081      * @param lowerLimit\r
3082      * @param upperLimit\r
3083      * @param n\r
3084      * @param maxByte\r
3085      * @param ranges\r
3086      * @return\r
3087      */\r
3088     private int allocateWeights(int lowerLimit, int upperLimit, int n,\r
3089             int maxByte, WeightRange ranges[]) {\r
3090         // number of usable byte values 3..maxByte\r
3091         int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;\r
3092         // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_\r
3093         // countBytes to the power of index, m_utilLongBuffer_ for unsignedness\r
3094         // gcc requires explicit initialization\r
3095         m_utilLongBuffer_[0] = 1;\r
3096         m_utilLongBuffer_[1] = countBytes;\r
3097         m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;\r
3098         m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;\r
3099         m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;\r
3100         int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte,\r
3101                 countBytes, ranges);\r
3102         if (rangeCount <= 0) {\r
3103             return 0;\r
3104         }\r
3105         // what is the maximum number of weights with these ranges?\r
3106         long maxCount = 0;\r
3107         for (int i = 0; i < rangeCount; ++i) {\r
3108             maxCount += (long) ranges[i].m_count_\r
3109                     * m_utilLongBuffer_[4 - ranges[i].m_length_];\r
3110         }\r
3111         if (maxCount < n) {\r
3112             return 0;\r
3113         }\r
3114         // set the length2 and count2 fields\r
3115         for (int i = 0; i < rangeCount; ++i) {\r
3116             ranges[i].m_length2_ = ranges[i].m_length_;\r
3117             ranges[i].m_count2_ = ranges[i].m_count_;\r
3118         }\r
3119         // try until we find suitably large ranges\r
3120         while (true) {\r
3121             // get the smallest number of bytes in a range\r
3122             int minLength = ranges[0].m_length2_;\r
3123             // sum up the number of elements that fit into ranges of each byte\r
3124             // length\r
3125             Arrays.fill(m_utilCountBuffer_, 0);\r
3126             for (int i = 0; i < rangeCount; ++i) {\r
3127                 m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;\r
3128             }\r
3129             // now try to allocate n elements in the available short ranges\r
3130             if (n <= m_utilCountBuffer_[minLength]\r
3131                     + m_utilCountBuffer_[minLength + 1]) {\r
3132                 // trivial cases, use the first few ranges\r
3133                 maxCount = 0;\r
3134                 rangeCount = 0;\r
3135                 do {\r
3136                     maxCount += ranges[rangeCount].m_count2_;\r
3137                     ++rangeCount;\r
3138                 } while (n > maxCount);\r
3139                 break;\r
3140             } else if (n <= ranges[0].m_count2_ * countBytes) {\r
3141                 // easy case, just make this one range large enough by\r
3142                 // lengthening it once more, possibly split it\r
3143                 rangeCount = 1;\r
3144                 // calculate how to split the range between maxLength-1\r
3145                 // (count1) and maxLength (count2)\r
3146                 long power_1 = m_utilLongBuffer_[minLength\r
3147                         - ranges[0].m_length_];\r
3148                 long power = power_1 * countBytes;\r
3149                 int count2 = (int) ((n + power - 1) / power);\r
3150                 int count1 = ranges[0].m_count_ - count2;\r
3151                 // split the range\r
3152                 if (count1 < 1) {\r
3153                     // lengthen the entire range to maxLength\r
3154                     lengthenRange(ranges, 0, maxByte, countBytes);\r
3155                 } else {\r
3156                     // really split the range\r
3157                     // create a new range with the end and initial and current\r
3158                     // length of the old one\r
3159                     rangeCount = 2;\r
3160                     ranges[1].m_end_ = ranges[0].m_end_;\r
3161                     ranges[1].m_length_ = ranges[0].m_length_;\r
3162                     ranges[1].m_length2_ = minLength;\r
3163                     // set the end of the first range according to count1\r
3164                     int i = ranges[0].m_length_;\r
3165                     int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1;\r
3166                     // ranges[0].count and count1 may be >countBytes from\r
3167                     // merging adjacent ranges; b > maxByte is possible\r
3168                     if (b <= maxByte) {\r
3169                         ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i,\r
3170                                 b);\r
3171                     } else {\r
3172                         ranges[0].m_end_ = setWeightByte(incWeight(\r
3173                                 ranges[0].m_start_, i - 1, maxByte), i, b\r
3174                                 - countBytes);\r
3175                     }\r
3176                     // set the bytes in the end weight at length + 1..length2\r
3177                     // to maxByte\r
3178                     b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8)\r
3179                             | maxByte; // this used to be 0xffffffff\r
3180                     ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i)\r
3181                             | (b >>> (i << 3)) & (b << ((4 - minLength) << 3));\r
3182                     // set the start of the second range to immediately follow\r
3183                     // the end of the first one\r
3184                     ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength,\r
3185                             maxByte);\r
3186                     // set the count values (informational)\r
3187                     ranges[0].m_count_ = count1;\r
3188                     ranges[1].m_count_ = count2;\r
3189 \r
3190                     ranges[0].m_count2_ = (int) (count1 * power_1);\r
3191                     // will be *countBytes when lengthened\r
3192                     ranges[1].m_count2_ = (int) (count2 * power_1);\r
3193 \r
3194                     // lengthen the second range to maxLength\r
3195                     lengthenRange(ranges, 1, maxByte, countBytes);\r
3196                 }\r
3197                 break;\r
3198             }\r
3199             // no good match, lengthen all minLength ranges and iterate\r
3200             for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {\r
3201                 lengthenRange(ranges, i, maxByte, countBytes);\r
3202             }\r
3203         }\r
3204 \r
3205         if (rangeCount > 1) {\r
3206             // sort the ranges by weight values\r
3207             Arrays.sort(ranges, 0, rangeCount);\r
3208         }\r
3209 \r
3210         // set maxByte in ranges[0] for ucol_nextWeight()\r
3211         ranges[0].m_count_ = maxByte;\r
3212 \r
3213         return rangeCount;\r
3214     }\r
3215 \r
3216     /**\r
3217      * Updates the range length\r
3218      * \r
3219      * @param range\r
3220      *            weight range array\r
3221      * @param offset\r
3222      *            to weight range array\r
3223      * @param maxByte\r
3224      * @param countBytes\r
3225      * @return new length\r
3226      */\r
3227     private static final int lengthenRange(WeightRange range[], int offset,\r
3228             int maxByte, int countBytes) {\r
3229         int length = range[offset].m_length2_ + 1;\r
3230         range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length,\r
3231                 RuleBasedCollator.BYTE_FIRST_TAILORED_);\r
3232         range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length,\r
3233                 maxByte);\r
3234         range[offset].m_count2_ *= countBytes;\r
3235         range[offset].m_length2_ = length;\r
3236         return length;\r
3237     }\r
3238 \r
3239     /**\r
3240      * Gets the weight\r
3241      * \r
3242      * @param weight\r
3243      * @param length\r
3244      * @param trail\r
3245      * @return new weight\r
3246      */\r
3247     private static final int setWeightTrail(int weight, int length, int trail) {\r
3248         length = (4 - length) << 3;\r
3249         return (weight & (0xffffff00 << length)) | (trail << length);\r
3250     }\r
3251 \r
3252     /**\r
3253      * take two CE weights and calculate the possible ranges of weights between\r
3254      * the two limits, excluding them for weights with up to 4 bytes there are\r
3255      * up to 2*4-1=7 ranges\r
3256      * \r
3257      * @param lowerLimit\r
3258      * @param upperLimit\r
3259      * @param maxByte\r
3260      * @param countBytes\r
3261      * @param ranges\r
3262      * @return weight ranges\r
3263      */\r
3264     private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte,\r
3265             int countBytes, WeightRange ranges[]) {\r
3266         // assume that both lowerLimit & upperLimit are not 0\r
3267         // get the lengths of the limits\r
3268         int lowerLength = lengthOfWeight(lowerLimit);\r
3269         int upperLength = lengthOfWeight(upperLimit);\r
3270         if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {\r
3271             return 0;\r
3272         }\r
3273         // check that neither is a prefix of the other\r
3274         if (lowerLength < upperLength) {\r
3275             if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {\r
3276                 return 0;\r
3277             }\r
3278         }\r
3279         // if the upper limit is a prefix of the lower limit then the earlier\r
3280         // test lowerLimit >= upperLimit has caught it\r
3281         // reset local variables\r
3282         // With the limit lengths of 1..4, there are up to 7 ranges for\r
3283         // allocation:\r
3284         // range minimum length\r
3285         // lower[4] 4\r
3286         // lower[3] 3\r
3287         // lower[2] 2\r
3288         // middle 1\r
3289         // upper[2] 2\r
3290         // upper[3] 3\r
3291         // upper[4] 4\r
3292         // We are now going to calculate up to 7 ranges.\r
3293         // Some of them will typically overlap, so we will then have to merge\r
3294         // and eliminate ranges.\r
3295 \r
3296         // We have to clean cruft from previous invocations\r
3297         // before doing anything. C++ already does that\r
3298         for (int length = 0; length < 5; length++) {\r
3299             m_utilLowerWeightRange_[length].clear();\r
3300             m_utilUpperWeightRange_[length].clear();\r
3301         }\r
3302         m_utilWeightRange_.clear();\r
3303 \r
3304         int weight = lowerLimit;\r
3305         for (int length = lowerLength; length >= 2; --length) {\r
3306             m_utilLowerWeightRange_[length].clear();\r
3307             int trail = getWeightByte(weight, length);\r
3308             if (trail < maxByte) {\r
3309                 m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(\r
3310                         weight, length);\r
3311                 m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(weight,\r
3312                         length, maxByte);\r
3313                 m_utilLowerWeightRange_[length].m_length_ = length;\r
3314                 m_utilLowerWeightRange_[length].m_count_ = maxByte - trail;\r
3315             }\r
3316             weight = truncateWeight(weight, length - 1);\r
3317         }\r
3318         m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);\r
3319 \r
3320         weight = upperLimit;\r
3321         // [0] and [1] are not used - this simplifies indexing,\r
3322         // m_utilUpperWeightRange_\r
3323 \r
3324         for (int length = upperLength; length >= 2; length--) {\r
3325             int trail = getWeightByte(weight, length);\r
3326             if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {\r
3327                 m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(\r
3328                         weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_);\r
3329                 m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(weight,\r
3330                         length);\r
3331                 m_utilUpperWeightRange_[length].m_length_ = length;\r
3332                 m_utilUpperWeightRange_[length].m_count_ = trail\r
3333                         - RuleBasedCollator.BYTE_FIRST_TAILORED_;\r
3334             }\r
3335             weight = truncateWeight(weight, length - 1);\r
3336         }\r
3337         m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);\r
3338 \r
3339         // set the middle range\r
3340         m_utilWeightRange_.m_length_ = 1;\r
3341         if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,\r
3342                 m_utilWeightRange_.m_start_) >= 0) {\r
3343             // if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) {\r
3344             m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;\r
3345         } else {\r
3346             // eliminate overlaps\r
3347             // remove the middle range\r
3348             m_utilWeightRange_.m_count_ = 0;\r
3349             // reduce or remove the lower ranges that go beyond upperLimit\r
3350             for (int length = 4; length >= 2; --length) {\r
3351                 if (m_utilLowerWeightRange_[length].m_count_ > 0\r
3352                         && m_utilUpperWeightRange_[length].m_count_ > 0) {\r
3353                     int start = m_utilUpperWeightRange_[length].m_start_;\r
3354                     int end = m_utilLowerWeightRange_[length].m_end_;\r
3355                     if (end >= start\r
3356                             || incWeight(end, length, maxByte) == start) {\r
3357                         // lower and upper ranges collide or are directly\r
3358                         // adjacent: merge these two and remove all shorter\r
3359                         // ranges\r
3360                         start = m_utilLowerWeightRange_[length].m_start_;\r
3361                         end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;\r
3362                         // merging directly adjacent ranges needs to subtract\r
3363                         // the 0/1 gaps in between;\r
3364                         // it may result in a range with count>countBytes\r
3365                         m_utilLowerWeightRange_[length].m_count_ = getWeightByte(\r
3366                                 end, length)\r
3367                                 - getWeightByte(start, length)\r
3368                                 + 1\r
3369                                 + countBytes\r
3370                                 * (getWeightByte(end, length - 1) - getWeightByte(\r
3371                                         start, length - 1));\r
3372                         m_utilUpperWeightRange_[length].m_count_ = 0;\r
3373                         while (--length >= 2) {\r
3374                             m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;\r
3375                         }\r
3376                         break;\r
3377                     }\r
3378                 }\r
3379             }\r
3380         }\r
3381 \r
3382         // copy the ranges, shortest first, into the result array\r
3383         int rangeCount = 0;\r
3384         if (m_utilWeightRange_.m_count_ > 0) {\r
3385             ranges[0] = new WeightRange(m_utilWeightRange_);\r
3386             rangeCount = 1;\r
3387         }\r
3388         for (int length = 2; length <= 4; ++length) {\r
3389             // copy upper first so that later the middle range is more likely\r
3390             // the first one to use\r
3391             if (m_utilUpperWeightRange_[length].m_count_ > 0) {\r
3392                 ranges[rangeCount] = new WeightRange(\r
3393                         m_utilUpperWeightRange_[length]);\r
3394                 ++rangeCount;\r
3395             }\r
3396             if (m_utilLowerWeightRange_[length].m_count_ > 0) {\r
3397                 ranges[rangeCount] = new WeightRange(\r
3398                         m_utilLowerWeightRange_[length]);\r
3399                 ++rangeCount;\r
3400             }\r
3401         }\r
3402         return rangeCount;\r
3403     }\r
3404 \r
3405     /**\r
3406      * Truncates the weight with length\r
3407      * \r
3408      * @param weight\r
3409      * @param length\r
3410      * @return truncated weight\r
3411      */\r
3412     private static final int truncateWeight(int weight, int length) {\r
3413         return weight & (0xffffffff << ((4 - length) << 3));\r
3414     }\r
3415 \r
3416     /**\r
3417      * Length of the weight\r
3418      * \r
3419      * @param weight\r
3420      * @return length of the weight\r
3421      */\r
3422     private static final int lengthOfWeight(int weight) {\r
3423         if ((weight & 0xffffff) == 0) {\r
3424             return 1;\r
3425         } else if ((weight & 0xffff) == 0) {\r
3426             return 2;\r
3427         } else if ((weight & 0xff) == 0) {\r
3428             return 3;\r
3429         }\r
3430         return 4;\r
3431     }\r
3432 \r
3433     /**\r
3434      * Increment the weight trail\r
3435      * \r
3436      * @param weight\r
3437      * @param length\r
3438      * @return new weight\r
3439      */\r
3440     private static final int incWeightTrail(int weight, int length) {\r
3441         return weight + (1 << ((4 - length) << 3));\r
3442     }\r
3443 \r
3444     /**\r
3445      * Decrement the weight trail\r
3446      * \r
3447      * @param weight\r
3448      * @param length\r
3449      * @return new weight\r
3450      */\r
3451     private static int decWeightTrail(int weight, int length) {\r
3452         return weight - (1 << ((4 - length) << 3));\r
3453     }\r
3454 \r
3455     /**\r
3456      * Gets the codepoint\r
3457      * \r
3458      * @param tbl\r
3459      *            contraction table\r
3460      * @param codePoint\r
3461      *            code point to look for\r
3462      * @return the offset to the code point\r
3463      */\r
3464     private static int findCP(BasicContractionTable tbl, char codePoint) {\r
3465         int position = 0;\r
3466         while (codePoint > tbl.m_codePoints_.charAt(position)) {\r
3467             position++;\r
3468             if (position > tbl.m_codePoints_.length()) {\r
3469                 return -1;\r
3470             }\r
3471         }\r
3472         if (codePoint == tbl.m_codePoints_.charAt(position)) {\r
3473             return position;\r
3474         } else {\r
3475             return -1;\r
3476         }\r
3477     }\r
3478 \r
3479     /**\r
3480      * Finds a contraction ce\r
3481      * \r
3482      * @param table\r
3483      * @param element\r
3484      * @param ch\r
3485      * @return ce\r
3486      */\r
3487     private static int findCE(ContractionTable table, int element, char ch) {\r
3488         if (table == null) {\r
3489             return CE_NOT_FOUND_;\r
3490         }\r
3491         BasicContractionTable tbl = getBasicContractionTable(table, element);\r
3492         if (tbl == null) {\r
3493             return CE_NOT_FOUND_;\r
3494         }\r
3495         int position = findCP(tbl, ch);\r
3496         if (position > tbl.m_CEs_.size() || position < 0) {\r
3497             return CE_NOT_FOUND_;\r
3498         }\r
3499         return tbl.m_CEs_.get(position).intValue();\r
3500     }\r
3501 \r
3502     /**\r
3503      * Checks if the string is tailored in the contraction\r
3504      * \r
3505      * @param table\r
3506      *            contraction table\r
3507      * @param element\r
3508      * @param array\r
3509      *            character array to check\r
3510      * @param offset\r
3511      *            array offset\r
3512      * @return true if it is tailored\r
3513      */\r
3514     private static boolean isTailored(ContractionTable table, int element,\r
3515             char array[], int offset) {\r
3516         while (array[offset] != 0) {\r
3517             element = findCE(table, element, array[offset]);\r
3518             if (element == CE_NOT_FOUND_) {\r
3519                 return false;\r
3520             }\r
3521             if (!isContractionTableElement(element)) {\r
3522                 return true;\r
3523             }\r
3524             offset++;\r
3525         }\r
3526         if (getCE(table, element, 0) != CE_NOT_FOUND_) {\r
3527             return true;\r
3528         } else {\r
3529             return false;\r
3530         }\r
3531     }\r
3532 \r
3533     /**\r
3534      * Assemble RuleBasedCollator\r
3535      * \r
3536      * @param t\r
3537      *            build table\r
3538      * @param collator\r
3539      *            to update\r
3540      */\r
3541     private void assembleTable(BuildTable t, RuleBasedCollator collator) {\r
3542         IntTrieBuilder mapping = t.m_mapping_;\r
3543         Vector<Integer> expansions = t.m_expansions_;\r
3544         ContractionTable contractions = t.m_contractions_;\r
3545         MaxExpansionTable maxexpansion = t.m_maxExpansions_;\r
3546 \r
3547         // contraction offset has to be in since we are building on the\r
3548         // UCA contractions\r
3549         // int beforeContractions = (HEADER_SIZE_\r
3550         // + paddedsize(expansions.size() << 2)) >>> 1;\r
3551         collator.m_contractionOffset_ = 0;\r
3552         int contractionsSize = constructTable(contractions);\r
3553 \r
3554         // the following operation depends on the trie data. Therefore, we have\r
3555         // to do it before the trie is compacted\r
3556         // sets jamo expansions\r
3557         getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_,\r
3558                 collator.m_isJamoSpecial_);\r
3559 \r
3560         // TODO: LATIN1 array is now in the utrie - it should be removed from\r
3561         // the calculation\r
3562         setAttributes(collator, t.m_options_);\r
3563         // copy expansions\r
3564         int size = expansions.size();\r
3565         collator.m_expansion_ = new int[size];\r
3566         for (int i = 0; i < size; i++) {\r
3567             collator.m_expansion_[i] = expansions.get(i).intValue();\r
3568         }\r
3569         // contractions block\r
3570         if (contractionsSize != 0) {\r
3571             // copy contraction index\r
3572             collator.m_contractionIndex_ = new char[contractionsSize];\r
3573             contractions.m_codePoints_.getChars(0, contractionsSize,\r
3574                     collator.m_contractionIndex_, 0);\r
3575             // copy contraction collation elements\r
3576             collator.m_contractionCE_ = new int[contractionsSize];\r
3577             for (int i = 0; i < contractionsSize; i++) {\r
3578                 collator.m_contractionCE_[i] = contractions.m_CEs_.get(i).intValue();\r
3579             }\r
3580         }\r
3581         // copy mapping table\r
3582         collator.m_trie_ = mapping.serialize(t,\r
3583                 RuleBasedCollator.DataManipulate.getInstance());\r
3584         // copy max expansion table\r
3585         // not copying the first element which is a dummy\r
3586         // to be in synch with icu4c's builder, we continue to use the\r
3587         // expansion offset\r
3588         // omitting expansion offset in builder\r
3589         collator.m_expansionOffset_ = 0;\r
3590         size = maxexpansion.m_endExpansionCE_.size();\r
3591         collator.m_expansionEndCE_ = new int[size - 1];\r
3592         for (int i = 1; i < size; i++) {\r
3593             collator.m_expansionEndCE_[i - 1] = maxexpansion.m_endExpansionCE_\r
3594                     .get(i).intValue();\r
3595         }\r
3596         collator.m_expansionEndCEMaxSize_ = new byte[size - 1];\r
3597         for (int i = 1; i < size; i++) {\r
3598             collator.m_expansionEndCEMaxSize_[i - 1] = maxexpansion.m_expansionCESize_\r
3599                     .get(i).byteValue();\r
3600         }\r
3601         // Unsafe chars table. Finish it off, then copy it.\r
3602         unsafeCPAddCCNZ(t);\r
3603         // Or in unsafebits from UCA, making a combined table.\r
3604         for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {\r
3605             t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];\r
3606         }\r
3607         collator.m_unsafe_ = t.m_unsafeCP_;\r
3608 \r
3609         // Finish building Contraction Ending chars hash table and then copy it\r
3610         // out.\r
3611         // Or in unsafebits from UCA, making a combined table\r
3612         for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {\r
3613             t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];\r
3614         }\r
3615         collator.m_contractionEnd_ = t.m_contrEndCP_;\r
3616     }\r
3617 \r
3618     /**\r
3619      * Sets this collator to use the all options and tables in UCA.\r
3620      * \r
3621      * @param collator\r
3622      *            which attribute is to be set\r
3623      * @param option\r
3624      *            to set with\r
3625      */\r
3626     private static final void setAttributes(RuleBasedCollator collator,\r
3627             CollationRuleParser.OptionSet option) {\r
3628         collator.latinOneFailed_ = true;\r
3629         collator.m_caseFirst_ = option.m_caseFirst_;\r
3630         collator.setDecomposition(option.m_decomposition_);\r
3631         collator\r
3632                 .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);\r
3633         collator.setCaseLevel(option.m_isCaseLevel_);\r
3634         collator.setFrenchCollation(option.m_isFrenchCollation_);\r
3635         collator.m_isHiragana4_ = option.m_isHiragana4_;\r
3636         collator.setStrength(option.m_strength_);\r
3637         collator.m_variableTopValue_ = option.m_variableTopValue_;\r
3638         collator.latinOneFailed_ = false;\r
3639     }\r
3640 \r
3641     /**\r
3642      * Constructing the contraction table\r
3643      * \r
3644      * @param table\r
3645      *            contraction table\r
3646      * @return\r
3647      */\r
3648     private int constructTable(ContractionTable table) {\r
3649         // See how much memory we need\r
3650         int tsize = table.m_elements_.size();\r
3651         if (tsize == 0) {\r
3652             return 0;\r
3653         }\r
3654         table.m_offsets_.clear();\r
3655         int position = 0;\r
3656         for (int i = 0; i < tsize; i++) {\r
3657             table.m_offsets_.add(new Integer(position));\r
3658             position += table.m_elements_.get(i).m_CEs_\r
3659                     .size();\r
3660         }\r
3661         table.m_CEs_.clear();\r
3662         table.m_codePoints_.delete(0, table.m_codePoints_.length());\r
3663         // Now stuff the things in\r
3664         StringBuilder cpPointer = table.m_codePoints_;\r
3665         Vector<Integer> CEPointer = table.m_CEs_;\r
3666         for (int i = 0; i < tsize; i++) {\r
3667             BasicContractionTable bct = table.m_elements_.get(i);\r
3668             int size = bct.m_CEs_.size();\r
3669             char ccMax = 0;\r
3670             char ccMin = 255;\r
3671             int offset = CEPointer.size();\r
3672             CEPointer.add(bct.m_CEs_.get(0));\r
3673             for (int j = 1; j < size; j++) {\r
3674                 char ch = bct.m_codePoints_.charAt(j);\r
3675                 char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);\r
3676                 if (cc > ccMax) {\r
3677                     ccMax = cc;\r
3678                 }\r
3679                 if (cc < ccMin) {\r
3680                     ccMin = cc;\r
3681                 }\r
3682                 cpPointer.append(ch);\r
3683                 CEPointer.add(bct.m_CEs_.get(j));\r
3684             }\r
3685             cpPointer.insert(offset,\r
3686                     (char) (((ccMin == ccMax) ? 1 : 0 << 8) | ccMax));\r
3687             for (int j = 0; j < size; j++) {\r
3688                 if (isContractionTableElement(CEPointer.get(offset + j).intValue())) {\r
3689                     int ce = CEPointer.get(offset + j).intValue();\r
3690                     CEPointer.set(offset + j,\r
3691                             new Integer(constructSpecialCE(getCETag(ce),\r
3692                                     table.m_offsets_.get(getContractionOffset(ce))\r
3693                                     .intValue())));\r
3694                 }\r
3695             }\r
3696         }\r
3697 \r
3698         for (int i = 0; i <= 0x10FFFF; i++) {\r
3699             int CE = table.m_mapping_.getValue(i);\r
3700             if (isContractionTableElement(CE)) {\r
3701                 CE = constructSpecialCE(getCETag(CE),\r
3702                         table.m_offsets_.get(getContractionOffset(CE)).intValue());\r
3703                 table.m_mapping_.setValue(i, CE);\r
3704             }\r
3705         }\r
3706         return position;\r
3707     }\r
3708 \r
3709     /**\r
3710      * Get contraction offset\r
3711      * \r
3712      * @param ce\r
3713      *            collation element\r
3714      * @return contraction offset\r
3715      */\r
3716     private static final int getContractionOffset(int ce) {\r
3717         return ce & 0xFFFFFF;\r
3718     }\r
3719 \r
3720     /**\r
3721      * Gets the maximum Jamo expansion\r
3722      * \r
3723      * @param mapping\r
3724      *            trie table\r
3725      * @param maxexpansion\r
3726      *            maximum expansion table\r
3727      * @param maxjamoexpansion\r
3728      *            maximum jamo expansion table\r
3729      * @param jamospecial\r
3730      *            is jamo special?\r
3731      */\r
3732     private static void getMaxExpansionJamo(IntTrieBuilder mapping,\r
3733             MaxExpansionTable maxexpansion,\r
3734             MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {\r
3735         int VBASE = 0x1161;\r
3736         int TBASE = 0x11A8;\r
3737         int VCOUNT = 21;\r
3738         int TCOUNT = 28;\r
3739         int v = VBASE + VCOUNT - 1;\r
3740         int t = TBASE + TCOUNT - 1;\r
3741 \r
3742         while (v >= VBASE) {\r
3743             int ce = mapping.getValue(v);\r
3744             if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {\r
3745                 setMaxExpansion(ce, (byte) 2, maxexpansion);\r
3746             }\r
3747             v--;\r
3748         }\r
3749 \r
3750         while (t >= TBASE) {\r
3751             int ce = mapping.getValue(t);\r
3752             if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {\r
3753                 setMaxExpansion(ce, (byte) 3, maxexpansion);\r
3754             }\r
3755             t--;\r
3756         }\r
3757         // According to the docs, 99% of the time, the Jamo will not be special\r
3758         if (jamospecial) {\r
3759             // gets the max expansion in all unicode characters\r
3760             int count = maxjamoexpansion.m_endExpansionCE_.size();\r
3761             byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_\r
3762                     + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);\r
3763             byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);\r
3764 \r
3765             while (count > 0) {\r
3766                 count--;\r
3767                 if ((maxjamoexpansion.m_isV_.get(count))\r
3768                         .booleanValue() == true) {\r
3769                     setMaxExpansion(\r
3770                             (maxjamoexpansion.m_endExpansionCE_\r
3771                                     .get(count)).intValue(), maxVSize,\r
3772                             maxexpansion);\r
3773                 } else {\r
3774                     setMaxExpansion(\r
3775                             (maxjamoexpansion.m_endExpansionCE_\r
3776                                     .get(count)).intValue(), maxTSize,\r
3777                             maxexpansion);\r
3778                 }\r
3779             }\r
3780         }\r
3781     }\r
3782 \r
3783     /**\r
3784      * To the UnsafeCP hash table, add all chars with combining class != 0\r
3785      * \r
3786      * @param t\r
3787      *            build table\r
3788      */\r
3789     private final void unsafeCPAddCCNZ(BuildTable t) {\r
3790         boolean buildCMTable = (buildCMTabFlag & (t.cmLookup == null));\r
3791         char[] cm = null; // combining mark array\r
3792         int[] index = new int[256];\r
3793         int count = 0;\r
3794 \r
3795         if (buildCMTable) {\r
3796             cm = new char[0x10000];\r
3797         }\r
3798         for (char c = 0; c < 0xffff; c++) {\r
3799             int fcd = m_nfcImpl_.getFCD16FromSingleLead(c);  // TODO: review for handling supplementary characters\r
3800             if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||\r
3801                     (UTF16.isLeadSurrogate(c) && fcd != 0)) {\r
3802                 // c is a leading surrogate with some FCD data\r
3803                 unsafeCPSet(t.m_unsafeCP_, c);\r
3804                 if (buildCMTable && (fcd != 0)) {\r
3805                     int cc = (fcd & 0xff);\r
3806                     int pos = (cc << 8) + index[cc];\r
3807                     cm[pos] = c;\r
3808                     index[cc]++;\r
3809                     count++;\r
3810                 }\r
3811             }\r
3812         }\r
3813 \r
3814         if (t.m_prefixLookup_ != null) {\r
3815             Enumeration<Elements> els = t.m_prefixLookup_.elements();\r
3816             while (els.hasMoreElements()) {\r
3817                 Elements e = els.nextElement();\r
3818                 // codepoints here are in the NFD form. We need to add the\r
3819                 // first code point of the NFC form to unsafe, because\r
3820                 // strcoll needs to backup over them.\r
3821                 // weiv: This is wrong! See the comment above.\r
3822                 // String decomp = Normalizer.decompose(e.m_cPoints_, true);\r
3823                 // unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));\r
3824                 // it should be:\r
3825                 String comp = Normalizer.compose(e.m_cPoints_, false);\r
3826                 unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));\r
3827             }\r
3828         }\r
3829 \r
3830         if (buildCMTable) {\r
3831             t.cmLookup = new CombinClassTable();\r
3832             t.cmLookup.generate(cm, count, index);\r
3833         }\r
3834     }\r
3835 \r
3836     /**\r
3837      * Create closure\r
3838      * \r
3839      * @param t\r
3840      *            build table\r
3841      * @param collator\r
3842      *            RuleBasedCollator\r
3843      * @param colEl\r
3844      *            collation element iterator\r
3845      * @param start\r
3846      * @param limit\r
3847      * @param type\r
3848      *            character type\r
3849      * @return\r
3850      */\r
3851     private boolean enumCategoryRangeClosureCategory(BuildTable t,\r
3852             RuleBasedCollator collator, CollationElementIterator colEl,\r
3853             int start, int limit, int type) {\r
3854         if (type != UCharacterCategory.UNASSIGNED\r
3855                 && type != UCharacterCategory.PRIVATE_USE) {\r
3856             // if the range is assigned - we might ommit more categories later\r
3857 \r
3858             for (int u32 = start; u32 < limit; u32++) {\r
3859                 String decomp = m_nfcImpl_.getDecomposition(u32);\r
3860                 if (decomp != null) {\r
3861                     String comp = UCharacter.toString(u32);\r
3862                     if (!collator.equals(comp, decomp)) {\r
3863                         m_utilElement_.m_cPoints_ = decomp;\r
3864                         m_utilElement_.m_prefix_ = 0;\r
3865                         Elements prefix = t.m_prefixLookup_.get(m_utilElement_);\r
3866                         if (prefix == null) {\r
3867                             m_utilElement_.m_cPoints_ = comp;\r
3868                             m_utilElement_.m_prefix_ = 0;\r
3869                             m_utilElement_.m_prefixChars_ = null;\r
3870                             colEl.setText(decomp);\r
3871                             int ce = colEl.next();\r
3872                             m_utilElement_.m_CELength_ = 0;\r
3873                             while (ce != CollationElementIterator.NULLORDER) {\r
3874                                 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;\r
3875                                 ce = colEl.next();\r
3876                             }\r
3877                         } else {\r
3878                             m_utilElement_.m_cPoints_ = comp;\r
3879                             m_utilElement_.m_prefix_ = 0;\r
3880                             m_utilElement_.m_prefixChars_ = null;\r
3881                             m_utilElement_.m_CELength_ = 1;\r
3882                             m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;\r
3883                             // This character uses a prefix. We have to add it\r
3884                             // to the unsafe table, as it decomposed form is\r
3885                             // already in. In Japanese, this happens for \u309e\r
3886                             // & \u30fe\r
3887                             // Since unsafeCPSet is static in ucol_elm, we are\r
3888                             // going to wrap it up in the unsafeCPAddCCNZ\r
3889                             // function\r
3890                         }\r
3891                         addAnElement(t, m_utilElement_);\r
3892                     }\r
3893                 }\r
3894             }\r
3895         }\r
3896         return true;\r
3897     }\r
3898 \r
3899     /**\r
3900      * Determine if a character is a Jamo\r
3901      * \r
3902      * @param ch\r
3903      *            character to test\r
3904      * @return true if ch is a Jamo, false otherwise\r
3905      */\r
3906     private static final boolean isJamo(char ch) {\r
3907         return (ch >= 0x1100 && ch <= 0x1112) || (ch >= 0x1175 && ch <= 0x1161)\r
3908                 || (ch >= 0x11A8 && ch <= 0x11C2);\r
3909     }\r
3910 \r
3911     /**\r
3912      * Produces canonical closure\r
3913      */\r
3914     private void canonicalClosure(BuildTable t) {\r
3915         BuildTable temp = new BuildTable(t);\r
3916         assembleTable(temp, temp.m_collator_);\r
3917         // produce canonical closure\r
3918         CollationElementIterator coleiter = temp.m_collator_\r
3919                 .getCollationElementIterator("");\r
3920         RangeValueIterator typeiter = UCharacter.getTypeIterator();\r
3921         RangeValueIterator.Element element = new RangeValueIterator.Element();\r
3922         while (typeiter.next(element)) {\r
3923             enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter,\r
3924                     element.start, element.limit, element.value);\r
3925         }\r
3926 \r
3927         t.cmLookup = temp.cmLookup;\r
3928         temp.cmLookup = null;\r
3929 \r
3930         for (int i = 0; i < m_parser_.m_resultLength_; i++) {\r
3931             char baseChar, firstCM;\r
3932             // now we need to generate the CEs\r
3933             // We stuff the initial value in the buffers, and increase the\r
3934             // appropriate buffer according to strength */\r
3935             // createElements(t, m_parser_.m_listHeader_[i]);\r
3936             CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_;\r
3937             m_utilElement_.clear();\r
3938             while (tok != null) {\r
3939                 m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;\r
3940                 m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;\r
3941                 if (tok.m_prefix_ != 0) {\r
3942                     // we will just copy the prefix here, and adjust accordingly\r
3943                     // in\r
3944                     // the addPrefix function in ucol_elm. The reason is that we\r
3945                     // need to add both composed AND decomposed elements to the\r
3946                     // unsafe table.\r
3947                     int size = tok.m_prefix_ >> 24;\r
3948                     int offset = tok.m_prefix_ & 0x00FFFFFF;\r
3949                     m_utilElement_.m_prefixChars_ = m_parser_.m_source_\r
3950                             .substring(offset, offset + size);\r
3951                     size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);\r
3952                     offset = (tok.m_source_ & 0x00FFFFFF)\r
3953                             + (tok.m_prefix_ >> 24);\r
3954                     m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(\r
3955                             offset, offset + size);\r
3956                 } else {\r
3957                     m_utilElement_.m_prefixChars_ = null;\r
3958                     int offset = tok.m_source_ & 0x00FFFFFF;\r
3959                     int size = tok.m_source_ >>> 24;\r
3960                     m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(\r
3961                             offset, offset + size);\r
3962                 }\r
3963                 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;\r
3964 \r
3965                 baseChar = firstCM = 0; // reset\r
3966                 for (int j = 0; j < m_utilElement_.m_cPoints_.length()\r
3967                         - m_utilElement_.m_cPointsOffset_; j++) {\r
3968 \r
3969                     int fcd = m_nfcImpl_.getFCD16FromSingleLead(m_utilElement_.m_cPoints_.charAt(j));  // TODO: review for handling supplementary characters\r
3970                     if ((fcd & 0xff) == 0) {\r
3971                         baseChar = m_utilElement_.m_cPoints_.charAt(j);\r
3972                     } else {\r
3973                         if ((baseChar != 0) && (firstCM == 0)) {\r
3974                             firstCM = m_utilElement_.m_cPoints_.charAt(j); // first\r
3975                                                                            // combining\r
3976                                                                            // mark\r
3977                         }\r
3978                     }\r
3979                 }\r
3980 \r
3981                 if ((baseChar != 0) && (firstCM != 0)) {\r
3982                     addTailCanonicalClosures(t, temp.m_collator_, coleiter,\r
3983                             baseChar, firstCM);\r
3984                 }\r
3985                 tok = tok.m_next_;\r
3986             }\r
3987         }\r
3988     }\r
3989 \r
3990     private void addTailCanonicalClosures(BuildTable t,\r
3991             RuleBasedCollator m_collator, CollationElementIterator colEl,\r
3992             char baseChar, char cMark) {\r
3993         if (t.cmLookup == null) {\r
3994             return;\r
3995         }\r
3996         CombinClassTable cmLookup = t.cmLookup;\r
3997         int[] index = cmLookup.index;\r
3998         int cClass = m_nfcImpl_.getFCD16FromSingleLead(cMark) & 0xff;  // TODO: review for handling supplementary characters\r
3999         int maxIndex = 0;\r
4000         char[] precompCh = new char[256];\r
4001         int[] precompClass = new int[256];\r
4002         int precompLen = 0;\r
4003         Elements element = new Elements();\r
4004 \r
4005         if (cClass > 0) {\r
4006             maxIndex = index[cClass - 1];\r
4007         }\r
4008         for (int i = 0; i < maxIndex; i++) {\r
4009             StringBuilder decompBuf = new StringBuilder();\r
4010             decompBuf.append(baseChar).append(cmLookup.cPoints[i]);\r
4011             String comp = Normalizer.compose(decompBuf.toString(), false);\r
4012             if (comp.length() == 1) {\r
4013                 precompCh[precompLen] = comp.charAt(0);\r
4014                 precompClass[precompLen] = (m_nfcImpl_.getFCD16FromSingleLead(cmLookup.cPoints[i]) & 0xff);  // TODO: review for handling supplementary characters\r
4015                 precompLen++;\r
4016                 StringBuilder decomp = new StringBuilder();\r
4017                 for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) {\r
4018                     if (m_utilElement_.m_cPoints_.charAt(j) == cMark) {\r
4019                         decomp.append(cmLookup.cPoints[i]);\r
4020                     } else {\r
4021                         decomp.append(m_utilElement_.m_cPoints_.charAt(j));\r
4022                     }\r
4023                 }\r
4024                 comp = Normalizer.compose(decomp.toString(), false);\r
4025                 StringBuilder buf = new StringBuilder(comp);\r
4026                 buf.append(cMark);\r
4027                 decomp.append(cMark);\r
4028                 comp = buf.toString();\r
4029 \r
4030                 element.m_cPoints_ = decomp.toString();\r
4031                 element.m_CELength_ = 0;\r
4032                 element.m_prefix_ = 0;\r
4033                 Elements prefix = t.m_prefixLookup_.get(element);\r
4034                 element.m_cPoints_ = comp;\r
4035                 element.m_uchars_ = comp;\r
4036 \r
4037                 if (prefix == null) {\r
4038                     element.m_prefix_ = 0;\r
4039                     element.m_prefixChars_ = null;\r
4040                     colEl.setText(decomp.toString());\r
4041                     int ce = colEl.next();\r
4042                     element.m_CELength_ = 0;\r
4043                     while (ce != CollationElementIterator.NULLORDER) {\r
4044                         element.m_CEs_[element.m_CELength_++] = ce;\r
4045                         ce = colEl.next();\r
4046                     }\r
4047                 } else {\r
4048                     element.m_cPoints_ = comp;\r
4049                     element.m_prefix_ = 0;\r
4050                     element.m_prefixChars_ = null;\r
4051                     element.m_CELength_ = 1;\r
4052                     element.m_CEs_[0] = prefix.m_mapCE_;\r
4053                 }\r
4054                 setMapCE(t, element);\r
4055                 finalizeAddition(t, element);\r
4056 \r
4057                 if (comp.length() > 2) {\r
4058                     // This is a fix for tailoring contractions with accented\r
4059                     // character at the end of contraction string.\r
4060                     addFCD4AccentedContractions(t, colEl, comp, element);\r
4061                 }\r
4062                 if (precompLen > 1) {\r
4063                     precompLen = addMultiCMontractions(t, colEl, element,\r
4064                             precompCh, precompClass, precompLen, cMark, i,\r
4065                             decomp.toString());\r
4066                 }\r
4067             }\r
4068         }\r
4069 \r
4070     }\r
4071 \r
4072     private void setMapCE(BuildTable t, Elements element) {\r
4073         Vector<Integer> expansions = t.m_expansions_;\r
4074         element.m_mapCE_ = 0;\r
4075 \r
4076         if (element.m_CELength_ == 2 // a two CE expansion\r
4077                 && RuleBasedCollator.isContinuation(element.m_CEs_[1])\r
4078                 && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that\r
4079                                                                                                           // has\r
4080                                                                                                           // only\r
4081                                                                                                           // primaries\r
4082                                                                                                           // in\r
4083                                                                                                           // continuation\r
4084                 && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)\r
4085                 // a common secondary\r
4086                 && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) { // and\r
4087                                                                                      // a\r
4088                                                                                      // common\r
4089                                                                                      // tertiary\r
4090 \r
4091             element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_\r
4092             // a long primary special\r
4093                     | (CE_LONG_PRIMARY_TAG_ << 24)\r
4094                     // first and second byte of primary\r
4095                     | ((element.m_CEs_[0] >> 8) & 0xFFFF00)\r
4096                     // third byte of primary\r
4097                     | ((element.m_CEs_[1] >> 24) & 0xFF);\r
4098         } else {\r
4099             // omitting expansion offset in builder\r
4100             // (HEADER_SIZE_ >> 2)\r
4101             int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_\r
4102                     | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)\r
4103                     | (addExpansion(expansions, element.m_CEs_[0]) << 4)\r
4104                     & 0xFFFFF0;\r
4105 \r
4106             for (int i = 1; i < element.m_CELength_; i++) {\r
4107                 addExpansion(expansions, element.m_CEs_[i]);\r
4108             }\r
4109             if (element.m_CELength_ <= 0xF) {\r
4110                 expansion |= element.m_CELength_;\r
4111             } else {\r
4112                 addExpansion(expansions, 0);\r
4113             }\r
4114             element.m_mapCE_ = expansion;\r
4115             setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],\r
4116                     (byte) element.m_CELength_, t.m_maxExpansions_);\r
4117         }\r
4118     }\r
4119 \r
4120     private int addMultiCMontractions(BuildTable t,\r
4121             CollationElementIterator colEl, Elements element, char[] precompCh,\r
4122             int[] precompClass, int maxComp, char cMark, int cmPos,\r
4123             String decomp) {\r
4124 \r
4125         CombinClassTable cmLookup = t.cmLookup;\r
4126         char[] combiningMarks = { cMark };\r
4127         int cMarkClass = UCharacter.getCombiningClass(cMark) & 0xFF;\r
4128         String comMark = new String(combiningMarks);\r
4129         int noOfPrecomposedChs = maxComp;\r
4130 \r
4131         for (int j = 0; j < maxComp; j++) {\r
4132             int count = 0;\r
4133             StringBuilder temp;\r
4134 \r
4135             do {\r
4136                 String newDecomp, comp;\r
4137 \r
4138                 if (count == 0) { // Decompose the saved precomposed char.\r
4139                     newDecomp = Normalizer.decompose(\r
4140                             new String(precompCh, j, 1), false);\r
4141                     temp = new StringBuilder(newDecomp);\r
4142                     temp.append(cmLookup.cPoints[cmPos]);\r
4143                     newDecomp = temp.toString();\r
4144                 } else {\r
4145                     temp = new StringBuilder(decomp);\r
4146                     temp.append(precompCh[j]);\r
4147                     newDecomp = temp.toString();\r
4148                 }\r
4149                 comp = Normalizer.compose(newDecomp, false);\r
4150                 if (comp.length() == 1) {\r
4151                     temp.append(cMark);\r
4152                     element.m_cPoints_ = temp.toString();\r
4153                     element.m_CELength_ = 0;\r
4154                     element.m_prefix_ = 0;\r
4155                     Elements prefix = t.m_prefixLookup_.get(element);\r
4156                     element.m_cPoints_ = comp + comMark;\r
4157                     if (prefix == null) {\r
4158                         element.m_prefix_ = 0;\r
4159                         element.m_prefixChars_ = null;\r
4160                         colEl.setText(temp.toString());\r
4161                         int ce = colEl.next();\r
4162                         element.m_CELength_ = 0;\r
4163                         while (ce != CollationElementIterator.NULLORDER) {\r
4164                             element.m_CEs_[element.m_CELength_++] = ce;\r
4165                             ce = colEl.next();\r
4166                         }\r
4167                     } else {\r
4168                         element.m_cPoints_ = comp;\r
4169                         element.m_prefix_ = 0;\r
4170                         element.m_prefixChars_ = null;\r
4171                         element.m_CELength_ = 1;\r
4172                         element.m_CEs_[0] = prefix.m_mapCE_;\r
4173                     }\r
4174                     setMapCE(t, element);\r
4175                     finalizeAddition(t, element);\r
4176                     precompCh[noOfPrecomposedChs] = comp.charAt(0);\r
4177                     precompClass[noOfPrecomposedChs] = cMarkClass;\r
4178                     noOfPrecomposedChs++;\r
4179                 }\r
4180             } while (++count < 2 && (precompClass[j] == cMarkClass));\r
4181         }\r
4182         return noOfPrecomposedChs;\r
4183     }\r
4184 \r
4185     private void addFCD4AccentedContractions(BuildTable t,\r
4186             CollationElementIterator colEl, String data, Elements element) {\r
4187         String decomp = Normalizer.decompose(data, false);\r
4188         String comp = Normalizer.compose(data, false);\r
4189 \r
4190         element.m_cPoints_ = decomp;\r
4191         element.m_CELength_ = 0;\r
4192         element.m_prefix_ = 0;\r
4193         Elements prefix = t.m_prefixLookup_.get(element);\r
4194         if (prefix == null) {\r
4195             element.m_cPoints_ = comp;\r
4196             element.m_prefix_ = 0;\r
4197             element.m_prefixChars_ = null;\r
4198             element.m_CELength_ = 0;\r
4199             colEl.setText(decomp);\r
4200             int ce = colEl.next();\r
4201             element.m_CELength_ = 0;\r
4202             while (ce != CollationElementIterator.NULLORDER) {\r
4203                 element.m_CEs_[element.m_CELength_++] = ce;\r
4204                 ce = colEl.next();\r
4205             }\r
4206             addAnElement(t, element);\r
4207         }\r
4208     }\r
4209 \r
4210     private void processUCACompleteIgnorables(BuildTable t) {\r
4211         TrieIterator trieiterator = new TrieIterator(\r
4212                 RuleBasedCollator.UCA_.m_trie_);\r
4213         RangeValueIterator.Element element = new RangeValueIterator.Element();\r
4214         while (trieiterator.next(element)) {\r
4215             int start = element.start;\r
4216             int limit = element.limit;\r
4217             if (element.value == 0) {\r
4218                 while (start < limit) {\r
4219                     int CE = t.m_mapping_.getValue(start);\r
4220                     if (CE == CE_NOT_FOUND_) {\r
4221                         m_utilElement_.m_prefix_ = 0;\r
4222                         m_utilElement_.m_uchars_ = UCharacter.toString(start);\r
4223                         m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;\r
4224                         m_utilElement_.m_cPointsOffset_ = 0;\r
4225                         m_utilElement_.m_CELength_ = 1;\r
4226                         m_utilElement_.m_CEs_[0] = 0;\r
4227                         addAnElement(t, m_utilElement_);\r
4228                     }\r
4229                     start++;\r
4230                 }\r
4231             }\r
4232         }\r
4233     }\r
4234 }\r