/**
 *******************************************************************************
 * Copyright (C) 1996-2008, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.text.ParseException;
import java.util.Hashtable;
import java.util.Arrays;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.impl.UCharacterProperty;

/**
 * Class for parsing collation rules; produces a list of tokens that will be
 * turned into collation elements
 * @author Syn Wee Quek
 * @since release 2.2, June 7 2002
 */
final class CollationRuleParser
{
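    // Example of the rule syntax this parser handles (for illustration):
    //
    //     &a < b << c <<< d = e
    //
    // where '&' resets the insertion point and '<', '<<', '<<<' and '='
    // introduce primary, secondary, tertiary and equal differences.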
    // public data members ---------------------------------------------------

    // package private constructors ------------------------------------------

    /**
     * <p>CollationRuleParser constructor that takes the rules.
     * Please see the RuleBasedCollator class description for more details on
     * the collation rule syntax.</p>
     * @see java.util.Locale
     * @param rules the collation rules to build the collation table from.
     * @exception ParseException thrown when the argument rules have invalid
     *            syntax.
     */
    CollationRuleParser(String rules) throws ParseException
    {
        extractSetsFromRules(rules);
        m_source_ = new StringBuffer(Normalizer.decompose(rules, false).trim());
        m_rules_ = m_source_.toString();
        m_current_ = 0;
        m_extraCurrent_ = m_source_.length();
        m_variableTop_ = null;
        m_parsedToken_ = new ParsedToken();
        m_hashTable_ = new Hashtable();
        m_options_ = new OptionSet(RuleBasedCollator.UCA_);
        m_listHeader_ = new TokenListHeader[512];
        m_resultLength_ = 0;
        // call assembleTokenList() manually, so that we can
        // init a parser and manually parse tokens
        //assembleTokenList();
    }

    // package private inner classes -----------------------------------------

    /**
     * Collation options set
     */
    static class OptionSet
    {
        // package private constructor ---------------------------------------

        /**
         * Initializes the option set with the argument collator
         * @param collator option to use
         */
        OptionSet(RuleBasedCollator collator)
        {
            m_variableTopValue_ = collator.m_variableTopValue_;
            m_isFrenchCollation_ = collator.isFrenchCollation();
            m_isAlternateHandlingShifted_
                                   = collator.isAlternateHandlingShifted();
            m_caseFirst_ = collator.m_caseFirst_;
            m_isCaseLevel_ = collator.isCaseLevel();
            m_decomposition_ = collator.getDecomposition();
            m_strength_ = collator.getStrength();
            m_isHiragana4_ = collator.m_isHiragana4_;
        }

        // package private data members --------------------------------------

        int m_variableTopValue_;
        boolean m_isFrenchCollation_;
        /**
         * Attribute for handling variable elements
         */
        boolean m_isAlternateHandlingShifted_;
        /**
         * who goes first, lowercase or uppercase
         */
        int m_caseFirst_;
        /**
         * do we have an extra case level
         */
        boolean m_isCaseLevel_;
        /**
         * attribute for normalization
         */
        int m_decomposition_;
        /**
         * attribute for strength
         */
        int m_strength_;
        /**
         * attribute for special Hiragana
         */
        boolean m_isHiragana4_;
    }

    /**
     * List of tokens used by the collation rules
     */
    static class TokenListHeader
    {
        Token m_first_;
        Token m_last_;
        Token m_reset_;
        boolean m_indirect_;
        int m_baseCE_;
        int m_baseContCE_;
        int m_nextCE_;
        int m_nextContCE_;
        int m_previousCE_;
        int m_previousContCE_;
        int m_pos_[] = new int[Collator.IDENTICAL + 1];
        int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
        Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
        Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
    }

    /**
     * Token wrapper for collation rules
     */
    static class Token
    {
       // package private data members ---------------------------------------

       int m_CE_[];
       int m_CELength_;
       int m_expCE_[];
       int m_expCELength_;
       int m_source_;
       int m_expansion_;
       int m_prefix_;
       int m_strength_;
       int m_toInsert_;
       int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
       TokenListHeader m_listHeader_;
       Token m_previous_;
       Token m_next_;
       StringBuffer m_rules_;
       char m_flags_;
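
       // Note (illustration): m_source_, m_expansion_ and m_prefix_ each pack
       // a length and an offset into the rule string m_rules_ as
       // (length << 24) | offset; for example, a 2-character token starting
       // at offset 5 would be stored as 0x02000005. hashCode() and equals()
       // below unpack the fields in the same way.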

       // package private constructors ---------------------------------------

       Token()
       {
           m_CE_ = new int[128];
           m_expCE_ = new int[128];
           // TODO: this should also handle reverse
           m_polarity_ = TOKEN_POLARITY_POSITIVE_;
           m_next_ = null;
           m_previous_ = null;
           m_CELength_ = 0;
           m_expCELength_ = 0;
       }

       // package private methods --------------------------------------------

       /**
        * Hashcode calculation for token
        * @return the hashcode
        */
       public int hashCode()
       {
           int result = 0;
           int len = (m_source_ & 0xFF000000) >>> 24;
           int inc = ((len - 32) / 32) + 1;

           int start = m_source_ & 0x00FFFFFF;
           int limit = start + len;

           while (start < limit) {
               result = (result * 37) + m_rules_.charAt(start);
               start += inc;
           }
           return result;
       }

       /**
        * Equals calculation
        * @param target object to compare
        * @return true if target is the same as this object
        */
       public boolean equals(Object target)
       {
           if (target == this) {
               return true;
           }
           if (target instanceof Token) {
               Token t = (Token)target;
               int sstart = m_source_ & 0x00FFFFFF;
               int tstart = t.m_source_ & 0x00FFFFFF;
               int slimit = (m_source_ & 0xFF000000) >> 24;
               int tlimit = (t.m_source_ & 0xFF000000) >> 24;

               int end = sstart + slimit - 1;

               if (m_source_ == 0 || t.m_source_ == 0) {
                   return false;
               }
               if (slimit != tlimit) {
                   return false;
               }
               if (m_source_ == t.m_source_) {
                   return true;
               }

               while (sstart < end
                      && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
               {
                   ++ sstart;
                   ++ tstart;
               }
               if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
                   return true;
               }
           }
           return false;
       }
    }

    // package private data member -------------------------------------------

    /**
     * Indicator that the token is a reset, ie & in the rules
     */
    static final int TOKEN_RESET_ = 0xDEADBEEF;

    /**
     * Number of token lists
     */
    int m_resultLength_;
    /**
     * List of parsed tokens
     */
    TokenListHeader m_listHeader_[];
    /**
     * Variable top token
     */
    Token m_variableTop_;
    /**
     * Collation options
     */
    OptionSet m_options_;
    /**
     * Normalized collation rules with some extra characters
     */
    StringBuffer m_source_;
    /**
     * Hash table to keep all tokens
     */
    Hashtable m_hashTable_;

    // package private method ------------------------------------------------

    void setDefaultOptionsInCollator(RuleBasedCollator collator)
    {
        collator.m_defaultStrength_ = m_options_.m_strength_;
        collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
        collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
        collator.m_defaultIsAlternateHandlingShifted_
                                    = m_options_.m_isAlternateHandlingShifted_;
        collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
        collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
        collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
        collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
    }

    // private inner classes -------------------------------------------------

    /**
     * This is a token that has been parsed but not yet processed. Used to
     * reduce the number of arguments in the parser
     */
    private static class ParsedToken
    {
        // private constructor ----------------------------------------------

        /**
         * Empty constructor
         */
        ParsedToken()
        {
            m_charsLen_ = 0;
            m_charsOffset_ = 0;
            m_extensionLen_ = 0;
            m_extensionOffset_ = 0;
            m_prefixLen_ = 0;
            m_prefixOffset_ = 0;
            m_flags_ = 0;
            m_strength_ = TOKEN_UNSET_;
        }

        // private data members ---------------------------------------------

        int m_strength_;
        int m_charsOffset_;
        int m_charsLen_;
        int m_extensionOffset_;
        int m_extensionLen_;
        int m_prefixOffset_;
        int m_prefixLen_;
        char m_flags_;
        char m_indirectIndex_;
    }

    /**
     * Boundary wrappers
     */
    private static class IndirectBoundaries
    {
        // package private constructor ---------------------------------------

        IndirectBoundaries(int startce[], int limitce[])
        {
            // Set values for the top - TODO: once we have values for all the
            // indirects, we are going to initialize here.
            m_startCE_ = startce[0];
            m_startContCE_ = startce[1];
            if (limitce != null) {
                m_limitCE_ = limitce[0];
                m_limitContCE_ = limitce[1];
            }
            else {
                m_limitCE_ = 0;
                m_limitContCE_ = 0;
            }
        }

        // package private data members --------------------------------------

        int m_startCE_;
        int m_startContCE_;
        int m_limitCE_;
        int m_limitContCE_;
    }

    /**
     * Collation option rule tag
     */
    private static class TokenOption
    {
        // package private constructor ---------------------------------------

        TokenOption(String name, int attribute, String suboptions[],
                    int suboptionattributevalue[])
        {
            m_name_ = name;
            m_attribute_ = attribute;
            m_subOptions_ = suboptions;
            m_subOptionAttributeValues_ = suboptionattributevalue;
        }

        // package private data member ---------------------------------------

        private String m_name_;
        private int m_attribute_;
        private String m_subOptions_[];
        private int m_subOptionAttributeValues_[];
    }

    // private variables -----------------------------------------------------

    /**
     * Current parsed token
     */
    private ParsedToken m_parsedToken_;
    /**
     * Collation rule
     */
    private String m_rules_;
    private int m_current_;
    /**
     * End of the option while reading.
     * Need it for UnicodeSet reading support.
     */
    private int m_optionEnd_;
    /*
     * Current offset in m_source
     */
    //private int m_sourceLimit_;
    /**
     * Offset to m_source_ for the extra expansion characters
     */
    private int m_extraCurrent_;

    /**
     * UnicodeSet that contains code points to be copied from the UCA
     */
    UnicodeSet m_copySet_;

    /**
     * UnicodeSet that contains code points for which we want to remove
     * UCA contractions. It implies copying of these code points from
     * the UCA.
     */
    UnicodeSet m_removeSet_;
    /*
     * This is space for the extra strings that need to be unquoted during the
     * parsing of the rules
     */
    //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
    /**
     * Indicator that the token is not set yet
     */
    private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
    /*
     * Indicator that the rule is in the > polarity, ie everything on the
     * right of the rule is less than
     */
    //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
    /**
     * Indicator that the rule is in the < polarity, ie everything on the
     * right of the rule is greater than
     */
    private static final int TOKEN_POLARITY_POSITIVE_ = 1;
    /**
     * Flag mask to determine if top is set
     */
    private static final int TOKEN_TOP_MASK_ = 0x04;
    /**
     * Flag mask to determine if variable top is set
     */
    private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
    /**
     * Flag mask to determine if a before attribute is set
     */
    private static final int TOKEN_BEFORE_ = 0x03;
    /**
     * For use in parsing token options
     */
    private static final int TOKEN_SUCCESS_MASK_ = 0x10;

    /**
     * These values are used for finding CE values for indirect positioning.
     * Indirect positioning is a mechanism for allowing resets on symbolic
     * values. It only works for resets and you cannot tailor indirect names.
     * An indirect name can define either an anchor point or a range. An anchor
     * point behaves in exactly the same way as a code point in reset would,
     * except that it cannot be tailored. A range (we currently only know of
     * the [top] range) will explicitly set the upper bound for generated CEs,
     * thus allowing for better control over how many CEs can be squeezed
     * between in the range without performance penalty. In that respect, we use
     * [top] for tailoring of locales that use CJK characters. Other indirect
     * values are currently a pure convenience, they can be used to assure that
     * the CEs will be always positioned in the same place relative to a point
     * with known properties (e.g. first primary ignorable).
     */
    private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];
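
    // For illustration: an indirect reset in the rules, e.g. "&[top] < x" or
    // "&[last primary ignorable] << y", anchors the tailoring at one of these
    // symbolic boundaries instead of at a concrete code point.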

//    /**
//     * Inverse UCA constants
//     */
//    private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
//    private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
//    private static final int INVERSE_SHIFT_VALUE_ = 20;

    /**
     * Collation option tags
     * [last variable] last variable value
     * [last primary ignorable] largest CE for primary ignorable
     * [last secondary ignorable] largest CE for secondary ignorable
     * [last tertiary ignorable] largest CE for tertiary ignorable
     * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
     */
    private static final TokenOption RULES_OPTIONS_[];
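
    // For illustration: these tags appear in rule strings as bracketed
    // options, e.g. "[alternate shifted]", "[caseFirst upper]" or
    // "[strength 3]", and are matched against the entries set up in the
    // static initializer below.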

    static
    {
        INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
        // UCOL_RESET_TOP_VALUE
        INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
                        RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
                        RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
                    null);
        // UCOL_LAST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
                    RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
                    null);

        // UCOL_FIRST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
                   null);
        // UCOL_LAST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
                   null);
        // UCOL_FIRST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
                   null);
        // UCOL_LAST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
                   null);
        // UCOL_FIRST_VARIABLE;
        INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
                   null);
        // UCOL_LAST_VARIABLE
        INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
                   null);
        // UCOL_FIRST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
                   null);
        // UCOL_LAST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_IMPLICIT
        INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
                   null);
        // UCOL_LAST_IMPLICIT
        INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
        // UCOL_FIRST_TRAILING
        INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
                   null);
        // UCOL_LAST_TRAILING
        INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
                   null);
        INDIRECT_BOUNDARIES_[14].m_limitCE_
                 = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;

        RULES_OPTIONS_ = new TokenOption[19];
        String option[] = {"non-ignorable", "shifted"};
        int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
                       RuleBasedCollator.AttributeValue.SHIFTED_};
        RULES_OPTIONS_[0] = new TokenOption("alternate",
                              RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
                              option, value);
        option = new String[1];
        option[0] = "2";
        value = new int[1];
        value[0] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[1] = new TokenOption("backwards",
                                 RuleBasedCollator.Attribute.FRENCH_COLLATION_,
                                 option, value);
        String offonoption[] = new String[2];
        offonoption[0] = "off";
        offonoption[1] = "on";
        int offonvalue[] = new int[2];
        offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
        offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[2] = new TokenOption("caseLevel",
                                       RuleBasedCollator.Attribute.CASE_LEVEL_,
                                       offonoption, offonvalue);
        option = new String[3];
        option[0] = "lower";
        option[1] = "upper";
        option[2] = "off";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
        value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
        value[2] = RuleBasedCollator.AttributeValue.OFF_;
        RULES_OPTIONS_[3] = new TokenOption("caseFirst",
                                       RuleBasedCollator.Attribute.CASE_FIRST_,
                                       option, value);
        RULES_OPTIONS_[4] = new TokenOption("normalization",
                               RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
                               offonoption, offonvalue);
        RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
                         RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
                         offonoption, offonvalue);
        option = new String[5];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        option[3] = "4";
        option[4] = "I";
        value = new int[5];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
        value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
        RULES_OPTIONS_[6] = new TokenOption("strength",
                                         RuleBasedCollator.Attribute.STRENGTH_,
                                         option, value);
        RULES_OPTIONS_[7] = new TokenOption("variable top",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[8] = new TokenOption("rearrange",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        option = new String[3];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        RULES_OPTIONS_[9] = new TokenOption("before",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  option, value);
        RULES_OPTIONS_[10] = new TokenOption("top",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        String firstlastoption[] = new String[7];
        firstlastoption[0] = "primary";
        firstlastoption[1] = "secondary";
        firstlastoption[2] = "tertiary";
        firstlastoption[3] = "variable";
        firstlastoption[4] = "regular";
        firstlastoption[5] = "implicit";
        firstlastoption[6] = "trailing";

        int firstlastvalue[] = new int[7];
        Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);

        RULES_OPTIONS_[11] = new TokenOption("first",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  firstlastoption, firstlastvalue);
        RULES_OPTIONS_[12] = new TokenOption("last",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  firstlastoption, firstlastvalue);
        RULES_OPTIONS_[13] = new TokenOption("optimize",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[15] = new TokenOption("undefined",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[17] = new TokenOption("charsetname",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[18] = new TokenOption("charset",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
    }

    /**
     * Utility data members
     */
    private Token m_utilToken_ = new Token();
    private CollationElementIterator m_UCAColEIter_
                      = RuleBasedCollator.UCA_.getCollationElementIterator("");
    private int m_utilCEBuffer_[] = new int[2];

    // private methods -------------------------------------------------------

    /**
     * Assembles the token list
     * @exception ParseException thrown when rules syntax fails
     */
    int assembleTokenList() throws ParseException
    {
        Token lastToken = null;
        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
        int sourcelimit = m_source_.length();
        int expandNext = 0;

        while (m_current_ < sourcelimit) {
            m_parsedToken_.m_prefixOffset_ = 0;
            if (parseNextToken(lastToken == null) < 0) {
                // we have reached the end
                continue;
            }
            char specs = m_parsedToken_.m_flags_;
            boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
            boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
            int lastStrength = TOKEN_UNSET_;
            if (lastToken != null) {
                lastStrength = lastToken.m_strength_;
            }
            m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                             | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            // 4 Lookup each source in the CharsToToken map, and find a
            // sourcetoken
            Token sourceToken = (Token)m_hashTable_.get(m_utilToken_);
            if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
                if (lastToken == null) {
                    // this means that rules haven't started properly
                    throwParseException(m_source_.toString(), 0);
                }
                //  6 Otherwise (when relation != reset)
                if (sourceToken == null) {
                    // If sourceToken is null, create new one
                    sourceToken = new Token();
                    sourceToken.m_rules_ = m_source_;
                    sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                           | m_parsedToken_.m_charsOffset_;
                    sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
                                           | m_parsedToken_.m_prefixOffset_;
                    // TODO: this should also handle reverse
                    sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
                    sourceToken.m_next_ = null;
                    sourceToken.m_previous_ = null;
                    sourceToken.m_CELength_ = 0;
                    sourceToken.m_expCELength_ = 0;
                    m_hashTable_.put(sourceToken, sourceToken);
                }
                else {
                    // we could have fished out a reset here
                    if (sourceToken.m_strength_ != TOKEN_RESET_
                        && lastToken != sourceToken) {
                        // otherwise remove sourceToken from where it was.
                        if (sourceToken.m_next_ != null) {
                            if (sourceToken.m_next_.m_strength_
                                                   > sourceToken.m_strength_) {
                                sourceToken.m_next_.m_strength_
                                                   = sourceToken.m_strength_;
                            }
                            sourceToken.m_next_.m_previous_
                                                    = sourceToken.m_previous_;
                        }
                        else {
                            sourceToken.m_listHeader_.m_last_
                                                    = sourceToken.m_previous_;
                        }
                        if (sourceToken.m_previous_ != null) {
                            sourceToken.m_previous_.m_next_
                                                        = sourceToken.m_next_;
                        }
                        else {
                            sourceToken.m_listHeader_.m_first_
                                                        = sourceToken.m_next_;
                        }
                        sourceToken.m_next_ = null;
                        sourceToken.m_previous_ = null;
                    }
                }
                sourceToken.m_strength_ = m_parsedToken_.m_strength_;
                sourceToken.m_listHeader_ = lastToken.m_listHeader_;

                // 1.  Find the strongest strength in each list, and set
                // strongestP and strongestN accordingly in the headers.
                if (lastStrength == TOKEN_RESET_
                    || sourceToken.m_listHeader_.m_first_ == null) {
                    // If LAST is a reset insert sourceToken in the list.
                    if (sourceToken.m_listHeader_.m_first_ == null) {
                        sourceToken.m_listHeader_.m_first_ = sourceToken;
                        sourceToken.m_listHeader_.m_last_ = sourceToken;
                    }
                    else { // we need to find a place for us
                           // and we'll get in front of the same strength
                        if (sourceToken.m_listHeader_.m_first_.m_strength_
                                                 <= sourceToken.m_strength_) {
                            sourceToken.m_next_
                                          = sourceToken.m_listHeader_.m_first_;
                            sourceToken.m_next_.m_previous_ = sourceToken;
                            sourceToken.m_listHeader_.m_first_ = sourceToken;
                            sourceToken.m_previous_ = null;
                        }
                        else {
                            lastToken = sourceToken.m_listHeader_.m_first_;
                            while (lastToken.m_next_ != null
                                   && lastToken.m_next_.m_strength_
                                                 > sourceToken.m_strength_) {
                                lastToken = lastToken.m_next_;
                            }
                            if (lastToken.m_next_ != null) {
                                lastToken.m_next_.m_previous_ = sourceToken;
                            }
                            else {
                                sourceToken.m_listHeader_.m_last_
                                                               = sourceToken;
                            }
                            sourceToken.m_previous_ = lastToken;
                            sourceToken.m_next_ = lastToken.m_next_;
                            lastToken.m_next_ = sourceToken;
                        }
                    }
                }
                else {
                    // Otherwise (when LAST is not a reset)
                    // if polarity (LAST) == polarity(relation), insert
                    // sourceToken after LAST, otherwise insert before.
                    // when inserting after or before, search to the next
                    // position with the same strength in that direction.
                    // (This is called postpone insertion).
                    if (sourceToken != lastToken) {
                        if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
                            while (lastToken.m_next_ != null
                                   && lastToken.m_next_.m_strength_
                                                   > sourceToken.m_strength_) {
                                lastToken = lastToken.m_next_;
                            }
                            sourceToken.m_previous_ = lastToken;
                            if (lastToken.m_next_ != null) {
                                lastToken.m_next_.m_previous_ = sourceToken;
                            }
                            else {
                                sourceToken.m_listHeader_.m_last_ = sourceToken;
                            }
                            sourceToken.m_next_ = lastToken.m_next_;
                            lastToken.m_next_ = sourceToken;
                        }
                        else {
                            while (lastToken.m_previous_ != null
                                   && lastToken.m_previous_.m_strength_
                                                > sourceToken.m_strength_) {
                                lastToken = lastToken.m_previous_;
                            }
                            sourceToken.m_next_ = lastToken;
                            if (lastToken.m_previous_ != null) {
                                lastToken.m_previous_.m_next_ = sourceToken;
                            }
                            else {
                                sourceToken.m_listHeader_.m_first_
                                                                 = sourceToken;
                            }
                            sourceToken.m_previous_ = lastToken.m_previous_;
                            lastToken.m_previous_ = sourceToken;
                        }
                    }
                    else { // repeated one thing twice in rules, stay with the
                           // stronger strength
                        if (lastStrength < sourceToken.m_strength_) {
                            sourceToken.m_strength_ = lastStrength;
                        }
                    }
                }
                // if the token was a variable top, we're gonna put it in
                if (variableTop == true && m_variableTop_ == null) {
                    variableTop = false;
                    m_variableTop_ = sourceToken;
                }
                // Treat the expansions.
                // There are two types of expansions: explicit (x / y) and
                // reset based propagating expansions
                // (&abc * d * e <=> &ab * d / c * e / c)
                // if both of them are in effect for a token, they are combined.
                sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
                                           | m_parsedToken_.m_extensionOffset_;
                if (expandNext != 0) {
                    if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
                        // primary strength kills off the implicit expansion
                        expandNext = 0;
                    }
                    else if (sourceToken.m_expansion_ == 0) {
                        // if there is no expansion, implicit is just added to
                        // the token
                        sourceToken.m_expansion_ = expandNext;
                    }
                    else {
                        // there is both explicit and implicit expansion.
                        // We need to make a combination
                        int start = expandNext & 0xFFFFFF;
                        int size = expandNext >>> 24;
                        if (size > 0) {
                            m_source_.append(m_source_.substring(start,
                                                                 start + size));
                        }
                        start = m_parsedToken_.m_extensionOffset_;
                        m_source_.append(m_source_.substring(start,
                                       start + m_parsedToken_.m_extensionLen_));
                        sourceToken.m_expansion_ = (size
                                        + m_parsedToken_.m_extensionLen_) << 24
                                        | m_extraCurrent_;
                        m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
                    }
                }
                // if the previous token was a reset before, the strength of this
                // token must match the strength of before. Otherwise we have an
                // undefined situation.
                // In other words, we currently have a kludge which we use to
888                // represent &a >> x. This is written as &[before 2]a << x.\r
889                if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {\r
890                    int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;\r
891                    if(beforeStrength != sourceToken.m_strength_) {\r
892                           throwParseException(m_source_.toString(), m_current_);\r
893                    }\r
894                }\r
895 \r
896             }\r
897             else {\r
898                 if (lastToken != null && lastStrength == TOKEN_RESET_) {\r
899                     // if the previous token was also a reset, this means that\r
900                     // we have two consecutive resets and we want to remove the\r
901                     // previous one if empty\r
902                     if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {\r
903                         m_resultLength_ --;\r
904                     }\r
905                 }\r
906                 if (sourceToken == null) {\r
907                     // this is a reset, but it might still be somewhere in the\r
908                     // tailoring, in shorter form\r
909                     int searchCharsLen = m_parsedToken_.m_charsLen_;\r
910                     while (searchCharsLen > 1 && sourceToken == null) {\r
911                         searchCharsLen --;\r
912                         // key = searchCharsLen << 24 | charsOffset;\r
913                         m_utilToken_.m_source_ = searchCharsLen << 24\r
914                                              | m_parsedToken_.m_charsOffset_;\r
915                         m_utilToken_.m_rules_ = m_source_;\r
916                         sourceToken = (Token)m_hashTable_.get(m_utilToken_);\r
917                     }\r
918                     if (sourceToken != null) {\r
919                         expandNext = (m_parsedToken_.m_charsLen_\r
920                                                       - searchCharsLen) << 24\r
921                                         | (m_parsedToken_.m_charsOffset_\r
922                                            + searchCharsLen);\r
923                     }\r
924                 }\r
925                 if ((specs & TOKEN_BEFORE_) != 0) {\r
926                     if (top == false) {\r
927                         // we're doing before & there is no indirection\r
928                         int strength = (specs & TOKEN_BEFORE_) - 1;\r
929                         if (sourceToken != null\r
930                             && sourceToken.m_strength_ != TOKEN_RESET_) {\r
931                             // this is a before that is already ordered in the UCA\r
932                             // - so we need to get the previous with good strength\r
933                             while (sourceToken.m_strength_ > strength\r
934                                    && sourceToken.m_previous_ != null) {\r
935                                 sourceToken = sourceToken.m_previous_;\r
936                             }\r
937                             // here, either we hit the strength or NULL\r
938                             if (sourceToken.m_strength_ == strength) {\r
939                                 if (sourceToken.m_previous_ != null) {\r
940                                     sourceToken = sourceToken.m_previous_;\r
941                                 }\r
942                                 else { // start of list\r
943                                     sourceToken\r
944                                          = sourceToken.m_listHeader_.m_reset_;\r
945                                 }\r
946                             }\r
947                             else { // we hit NULL, we should be doing the else part\r
948                                 sourceToken\r
949                                          = sourceToken.m_listHeader_.m_reset_;\r
950                                 sourceToken = getVirginBefore(sourceToken,\r
951                                                               strength);\r
952                             }\r
953                         }\r
954                         else {\r
955                             sourceToken\r
956                                       = getVirginBefore(sourceToken, strength);\r
957                         }\r
958                     }\r
959                     else {\r
960                         // this is both before and indirection\r
961                         top = false;\r
962                         m_listHeader_[m_resultLength_] = new TokenListHeader();\r
963                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
964                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
965                         m_listHeader_[m_resultLength_].m_indirect_ = true;\r
966                         // we need to do slightly more work. we need to get the\r
967                         // baseCE using the inverse UCA & getPrevious. The next\r
968                         // bound is not set, and will be decided in ucol_bld\r
969                         int strength = (specs & TOKEN_BEFORE_) - 1;\r
970                         int baseCE = INDIRECT_BOUNDARIES_[\r
971                                    m_parsedToken_.m_indirectIndex_].m_startCE_;\r
972                         int baseContCE = INDIRECT_BOUNDARIES_[\r
973                                m_parsedToken_.m_indirectIndex_].m_startContCE_;\r
974                         int ce[] = new int[2];\r
975                         if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)\r
976                         && (baseCE >>> 24 <=  RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */\r
977                             int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;\r
978                             int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);\r
979                             int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);\r
980                             ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;\r
981                             ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;\r
982                         } else {\r
983                             CollationParsedRuleBuilder.InverseUCA invuca\r
984                                 = CollationParsedRuleBuilder.INVERSE_UCA_;\r
985                             invuca.getInversePrevCE(baseCE, baseContCE, strength,\r
986                                     ce);\r
987                         }\r
988                         m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];\r
989                         m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];\r
990                         m_listHeader_[m_resultLength_].m_nextCE_ = 0;\r
991                         m_listHeader_[m_resultLength_].m_nextContCE_ = 0;\r
992 \r
993                         sourceToken = new Token();\r
994                         expandNext = initAReset(0, sourceToken);\r
995                     }\r
996                 }\r
997                 // 5 If the relation is a reset:\r
998                 // If sourceToken is null\r
999                 // Create new list, create new sourceToken, make the baseCE\r
1000                 // from source, put the sourceToken in ListHeader of the new\r
1001                 // list\r
1002                 if (sourceToken == null) {\r
1003                     if (m_listHeader_[m_resultLength_] == null) {\r
1004                         m_listHeader_[m_resultLength_] = new TokenListHeader();\r
1005                     }\r
1006                     // 3 Consider each item: relation, source, and expansion:\r
1007                     // e.g. ...< x / y ...\r
1008                     // First convert all expansions into normal form.\r
1009                     // Examples:\r
1010                     // If "xy" doesn't occur earlier in the list or in the UCA,\r
1011                     // convert &xy * c * d * ... into &x * c/y * d * ...\r
1012                     // Note: reset values can never have expansions, although\r
1013                     // they can cause the very next item to have one. They may\r
1014                     // be contractions, if they are found earlier in the list.\r
1015                     if (top == false) {\r
1016                         CollationElementIterator coleiter\r
1017                         = RuleBasedCollator.UCA_.getCollationElementIterator(\r
1018                             m_source_.substring(m_parsedToken_.m_charsOffset_,\r
1019                                                 m_parsedToken_.m_charsOffset_\r
1020                                                 + m_parsedToken_.m_charsLen_));\r
1021 \r
1022                         int CE = coleiter.next();\r
1023                         // offset to the character in the full rule string\r
1024                         int expand = coleiter.getOffset()\r
1025                                      + m_parsedToken_.m_charsOffset_;\r
1026                         int SecondCE = coleiter.next();\r
1027 \r
1028                         m_listHeader_[m_resultLength_].m_baseCE_\r
1029                                                              = CE & 0xFFFFFF3F;\r
1030                         if (RuleBasedCollator.isContinuation(SecondCE)) {\r
1031                             m_listHeader_[m_resultLength_].m_baseContCE_\r
1032                                                                     = SecondCE;\r
1033                         }\r
1034                         else {\r
1035                             m_listHeader_[m_resultLength_].m_baseContCE_ = 0;\r
1036                         }\r
1037                         m_listHeader_[m_resultLength_].m_nextCE_ = 0;\r
1038                         m_listHeader_[m_resultLength_].m_nextContCE_ = 0;\r
1039                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
1040                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
1041                         m_listHeader_[m_resultLength_].m_indirect_ = false;\r
1042                         sourceToken = new Token();\r
1043                         expandNext = initAReset(expand, sourceToken);\r
1044                     }\r
1045                     else { // top == TRUE\r
1046                         top = false;\r
1047                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
1048                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
1049                         m_listHeader_[m_resultLength_].m_indirect_ = true;\r
1050                         IndirectBoundaries ib = INDIRECT_BOUNDARIES_[\r
1051                                               m_parsedToken_.m_indirectIndex_];\r
1052                         m_listHeader_[m_resultLength_].m_baseCE_\r
1053                                                                = ib.m_startCE_;\r
1054                         m_listHeader_[m_resultLength_].m_baseContCE_\r
1055                                                            = ib.m_startContCE_;\r
1056                         m_listHeader_[m_resultLength_].m_nextCE_\r
1057                                                                = ib.m_limitCE_;\r
1058                         m_listHeader_[m_resultLength_].m_nextContCE_\r
1059                                                            = ib.m_limitContCE_;\r
1060                         sourceToken = new Token();\r
1061                         expandNext = initAReset(0, sourceToken);\r
1062                     }\r
1063                 }\r
1064                 else { // reset to something already in rules\r
1065                     top = false;\r
1066                 }\r
1067             }\r
1068             // 7 After all this, set LAST to point to sourceToken, and goto\r
1069             // step 3.\r
1070             lastToken = sourceToken;\r
1071         }\r
1072 \r
1073         if (m_resultLength_ > 0\r
1074             && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {\r
1075             m_resultLength_ --;\r
1076         }\r
1077         return m_resultLength_;\r
1078     }\r
1079 \r
1080     /**\r
1081      * Formats and throws a ParseException\r
1082      * @param rules collation rule that failed\r
1083      * @param offset failed offset in rules\r
1084      * @throws ParseException with failure information\r
1085      */\r
1086     private static final void throwParseException(String rules, int offset)\r
1087                                                           throws ParseException\r
1088     {\r
1089         // for pre-context\r
1090         String precontext = rules.substring(0, offset);\r
1091         String postcontext = rules.substring(offset, rules.length());\r
1092         StringBuffer error = new StringBuffer(\r
1093                                     "Parse error occurred in rule at offset ");\r
1094         error.append(offset);\r
1095         error.append("\n after the prefix \"");\r
1096         error.append(precontext);\r
1097         error.append("\" before the suffix \"");\r
1098         error.append(postcontext);\r
1099         throw new ParseException(error.toString(), offset);\r
1100     }\r
1101 \r
1102     private final boolean doSetTop() {\r
1103         m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1104         m_source_.append((char)0xFFFE);\r
1105         IndirectBoundaries ib =\r
1106                   INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];\r
1107         m_source_.append((char)(ib.m_startCE_ >> 16));\r
1108         m_source_.append((char)(ib.m_startCE_ & 0xFFFF));\r
1109         m_extraCurrent_ += 3;\r
1110         if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_\r
1111                                                        ].m_startContCE_ == 0) {\r
1112             m_parsedToken_.m_charsLen_ = 3;\r
1113         }\r
1114         else {\r
1115             m_source_.append((char)(INDIRECT_BOUNDARIES_[\r
1116                                         m_parsedToken_.m_indirectIndex_\r
1117                                     ].m_startContCE_ >> 16));\r
1118             m_source_.append((char)(INDIRECT_BOUNDARIES_[\r
1119                                         m_parsedToken_.m_indirectIndex_\r
1120                                     ].m_startContCE_ & 0xFFFF));\r
1121             m_extraCurrent_ += 2;\r
1122             m_parsedToken_.m_charsLen_ = 5;\r
1123         }\r
1124         return true;\r
1125     }\r
1126 \r
1127     private static boolean isCharNewLine(char c) {\r
1128         switch (c) {\r
1129         case 0x000A: /* LF */\r
1130         case 0x000D: /* CR */\r
1131         case 0x000C: /* FF */\r
1132         case 0x0085: /* NEL */\r
1133         case 0x2028: /* LS */\r
1134         case 0x2029: /* PS */\r
1135             return true;\r
1136         default:\r
1137             return false;\r
1138         }\r
1139     }\r
1140 \r
1141     /**\r
1142      * Gets the next token from the rule string.\r
1143      *\r
1144      * @param startofrules\r
1145      *            flag indicating if we are at the start of rules\r
1146      * @return the offset in the rules after the parsed token, or -1 at the end of the rules\r
1147      * @exception ParseException\r
1148      *                thrown when rule parsing fails\r
1149      */\r
1150     private int parseNextToken(boolean startofrules) throws ParseException\r
1151     {\r
1152         // parsing part\r
1153         boolean variabletop = false;\r
1154         boolean top = false;\r
1155         boolean inchars = true;\r
1156         boolean inquote = false;\r
1157         boolean wasinquote = false;\r
1158         byte before = 0;\r
1159         boolean isescaped = false;\r
1160         int /*newcharslen = 0,*/ newextensionlen = 0;\r
1161         int /*charsoffset = 0,*/ extensionoffset = 0;\r
1162         int newstrength = TOKEN_UNSET_;\r
1163 \r
1164         m_parsedToken_.m_charsLen_ = 0;\r
1165         m_parsedToken_.m_charsOffset_ = 0;\r
1166         m_parsedToken_.m_prefixOffset_ = 0;\r
1167         m_parsedToken_.m_prefixLen_ = 0;\r
1168         m_parsedToken_.m_indirectIndex_ = 0;\r
1169 \r
1170         int limit = m_rules_.length();\r
1171         while (m_current_ < limit) {\r
1172             char ch = m_source_.charAt(m_current_);\r
1173             if (inquote) {\r
1174                 if (ch == 0x0027) { // '\''\r
1175                     inquote = false;\r
1176                 }\r
1177                 else {\r
1178                     if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {\r
1179                          if (m_parsedToken_.m_charsLen_ == 0) {\r
1180                              m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1181                          }\r
1182                          m_parsedToken_.m_charsLen_ ++;\r
1183                     }\r
1184                     else {\r
1185                         if (newextensionlen == 0) {\r
1186                             extensionoffset = m_extraCurrent_;\r
1187                         }\r
1188                         newextensionlen ++;\r
1189                     }\r
1190                 }\r
1191             }\r
1192             else if (isescaped) {\r
1193                 isescaped = false;\r
1194                 if (newstrength == TOKEN_UNSET_) {\r
1195                     throwParseException(m_rules_, m_current_);\r
1196                 }\r
1197                 if (ch != 0 && m_current_ != limit) {\r
1198                     if (inchars) {\r
1199                         if (m_parsedToken_.m_charsLen_ == 0) {\r
1200                             m_parsedToken_.m_charsOffset_ = m_current_;\r
1201                         }\r
1202                         m_parsedToken_.m_charsLen_ ++;\r
1203                     }\r
1204                     else {\r
1205                         if (newextensionlen == 0) {\r
1206                             extensionoffset = m_current_;\r
1207                         }\r
1208                         newextensionlen ++;\r
1209                     }\r
1210                 }\r
1211             }\r
1212             else {\r
1213                 if (!UCharacterProperty.isRuleWhiteSpace(ch)) {\r
1214                     // Sets the strength for this entry\r
1215                     switch (ch) {\r
1216                     case 0x003D : // '='\r
1217                         if (newstrength != TOKEN_UNSET_) {\r
1218                             return doEndParseNextToken(newstrength,\r
1219                                                        top,\r
1220                                                        extensionoffset,\r
1221                                                        newextensionlen,\r
1222                                                        variabletop, before);\r
1223                         }\r
1224                         // if we start with strength, we'll reset to top\r
1225                         if (startofrules == true) {\r
1226                             m_parsedToken_.m_indirectIndex_ = 5;\r
1227                             top = doSetTop();\r
1228                             return doEndParseNextToken(TOKEN_RESET_,\r
1229                                                        top,\r
1230                                                        extensionoffset,\r
1231                                                        newextensionlen,\r
1232                                                        variabletop, before);\r
1233                         }\r
1234                         newstrength = Collator.IDENTICAL;\r
1235                         break;\r
1236                     case 0x002C : // ','\r
1237                         if (newstrength != TOKEN_UNSET_) {\r
1238                             return doEndParseNextToken(newstrength,\r
1239                                                        top,\r
1240                                                        extensionoffset,\r
1241                                                        newextensionlen,\r
1242                                                        variabletop, before);\r
1243                         }\r
1244                         // if we start with strength, we'll reset to top\r
1245                         if (startofrules == true) {\r
1246                             m_parsedToken_.m_indirectIndex_ = 5;\r
1247                             top = doSetTop();\r
1248                             return doEndParseNextToken(TOKEN_RESET_,\r
1249                                                        top,\r
1250                                                        extensionoffset,\r
1251                                                        newextensionlen,\r
1252                                                        variabletop, before);\r
1253                         }\r
1254                         newstrength = Collator.TERTIARY;\r
1255                         break;\r
1256                     case 0x003B : // ';'\r
1257                         if (newstrength != TOKEN_UNSET_) {\r
1258                             return doEndParseNextToken(newstrength,\r
1259                                                        top,\r
1260                                                        extensionoffset,\r
1261                                                        newextensionlen,\r
1262                                                        variabletop, before);\r
1263                         }\r
1264                         // if we start with strength, we'll reset to top\r
1265                         if (startofrules == true) {\r
1266                             m_parsedToken_.m_indirectIndex_ = 5;\r
1267                             top = doSetTop();\r
1268                             return doEndParseNextToken(TOKEN_RESET_,\r
1269                                                        top,\r
1270                                                        extensionoffset,\r
1271                                                        newextensionlen,\r
1272                                                        variabletop, before);\r
1273                         }\r
1274                         newstrength = Collator.SECONDARY;\r
1275                         break;\r
1276                     case 0x003C : // '<'\r
1277                         if (newstrength != TOKEN_UNSET_) {\r
1278                             return doEndParseNextToken(newstrength,\r
1279                                                        top,\r
1280                                                        extensionoffset,\r
1281                                                        newextensionlen,\r
1282                                                        variabletop, before);\r
1283                         }\r
1284                         // if we start with strength, we'll reset to top\r
1285                         if (startofrules == true) {\r
1286                             m_parsedToken_.m_indirectIndex_ = 5;\r
1287                             top = doSetTop();\r
1288                             return doEndParseNextToken(TOKEN_RESET_,\r
1289                                                        top,\r
1290                                                        extensionoffset,\r
1291                                                        newextensionlen,\r
1292                                                        variabletop, before);\r
1293                         }\r
1294                         // before this, do a scan to verify whether this is\r
1295                         // another strength\r
1296                         if (m_source_.charAt(m_current_ + 1) == 0x003C) {\r
1297                             m_current_ ++;\r
1298                             if (m_source_.charAt(m_current_ + 1) == 0x003C) {\r
1299                                 m_current_ ++; // three in a row!\r
1300                                 newstrength = Collator.TERTIARY;\r
1301                             }\r
1302                             else { // two in a row\r
1303                                 newstrength = Collator.SECONDARY;\r
1304                             }\r
1305                         }\r
1306                         else { // just one\r
1307                             newstrength = Collator.PRIMARY;\r
1308                         }\r
1309                         break;\r
1310                     case 0x0026 : // '&'\r
1311                         if (newstrength != TOKEN_UNSET_) {\r
1312                             return doEndParseNextToken(newstrength,\r
1313                                                        top,\r
1314                                                        extensionoffset,\r
1315                                                        newextensionlen,\r
1316                                                        variabletop, before);\r
1317                         }\r
1318                         newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0\r
1319                         break;\r
1320                     case 0x005b : // '['\r
1321                         // options - read an option, analyze it\r
1322                         m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);\r
1323                         if (m_optionEnd_ != -1) { // ']'\r
1324                             byte result = readAndSetOption();\r
1325                             m_current_ = m_optionEnd_;\r
1326                             if ((result & TOKEN_TOP_MASK_) != 0) {\r
1327                                 if (newstrength == TOKEN_RESET_) {\r
1328                                     top = doSetTop();\r
1329                                     if (before != 0) {\r
1330                                         // This is a combination of before and\r
1331                                         // indirection like\r
1332                                         // '&[before 2][first regular]<b'\r
1333                                         m_source_.append((char)0x002d);\r
1334                                         m_source_.append((char)before);\r
1335                                         m_extraCurrent_ += 2;\r
1336                                         m_parsedToken_.m_charsLen_ += 2;\r
1337                                     }\r
1338                                     m_current_ ++;\r
1339                                     return doEndParseNextToken(newstrength,\r
1340                                                        true,\r
1341                                                        extensionoffset,\r
1342                                                        newextensionlen,\r
1343                                                        variabletop, before);\r
1344                                 }\r
1345                                 else {\r
1346                                     throwParseException(m_rules_, m_current_);\r
1347                                 }\r
1348                             }\r
1349                             else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {\r
1350                                 if (newstrength != TOKEN_RESET_\r
1351                                     && newstrength != TOKEN_UNSET_) {\r
1352                                     variabletop = true;\r
1353                                     m_parsedToken_.m_charsOffset_\r
1354                                                              = m_extraCurrent_;\r
1355                                     m_source_.append((char)0xFFFF);\r
1356                                     m_extraCurrent_ ++;\r
1357                                     m_current_ ++;\r
1358                                     m_parsedToken_.m_charsLen_ = 1;\r
1359                                     return doEndParseNextToken(newstrength,\r
1360                                                        top,\r
1361                                                        extensionoffset,\r
1362                                                        newextensionlen,\r
1363                                                        variabletop, before);\r
1364                                 }\r
1365                                 else {\r
1366                                     throwParseException(m_rules_, m_current_);\r
1367                                 }\r
1368                             }\r
1369                             else if ((result & TOKEN_BEFORE_) != 0){\r
1370                                 if (newstrength == TOKEN_RESET_) {\r
1371                                     before = (byte)(result & TOKEN_BEFORE_);\r
1372                                 }\r
1373                                 else {\r
1374                                     throwParseException(m_rules_, m_current_);\r
1375                                 }\r
1376                             }\r
1377                         }\r
1378                         break;\r
1379                     case 0x002F : // '/'\r
1380                         wasinquote = false; // if we were copying source\r
1381                                             // characters, we want to stop now\r
1382                         inchars = false; // we're now processing expansion\r
1383                         break;\r
1384                     case 0x005C : // back slash for escaped chars\r
1385                         isescaped = true;\r
1386                         break;\r
1387                     // found a quote, we're gonna start copying\r
1388                     case 0x0027 : //'\''\r
1389                         if (newstrength == TOKEN_UNSET_) {\r
1390                             // quote is illegal until we have a strength\r
1391                             throwParseException(m_rules_, m_current_);\r
1392                         }\r
1393                         inquote = true;\r
1394                         if (inchars) { // we're doing characters\r
1395                             if (wasinquote == false) {\r
1396                                 m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1397                             }\r
1398                             if (m_parsedToken_.m_charsLen_ != 0) {\r
1399                                 m_source_.append(m_source_.substring(\r
1400                                        m_current_ - m_parsedToken_.m_charsLen_,\r
1401                                        m_current_));\r
1402                                 m_extraCurrent_ += m_parsedToken_.m_charsLen_;\r
1403                             }\r
1404                             m_parsedToken_.m_charsLen_ ++;\r
1405                         }\r
1406                         else { // we're doing an expansion\r
1407                             if (wasinquote == false) {\r
1408                                 extensionoffset = m_extraCurrent_;\r
1409                             }\r
1410                             if (newextensionlen != 0) {\r
1411                                 m_source_.append(m_source_.substring(\r
1412                                                    m_current_ - newextensionlen,\r
1413                                                    m_current_));\r
1414                                 m_extraCurrent_ += newextensionlen;\r
1415                             }\r
1416                             newextensionlen ++;\r
1417                         }\r
1418                         wasinquote = true;\r
1419                         m_current_ ++;\r
1420                         ch = m_source_.charAt(m_current_);\r
1421                         if (ch == 0x0027) { // copy the double quote\r
1422                             m_source_.append(ch);\r
1423                             m_extraCurrent_ ++;\r
1424                             inquote = false;\r
1425                         }\r
1426                         break;\r
1427                     // '@' turns on French collation, but only if the strength is not\r
1428                     // currently set; otherwise it is just a regular character in the rules\r
1429                     case 0x0040 : // '@'\r
1430                         if (newstrength == TOKEN_UNSET_) {\r
1431                             m_options_.m_isFrenchCollation_ = true;\r
1432                             break;\r
1433                         } // else fall through to the '|' handling below\r
1434                     case 0x007C : //|\r
1435                         // this means we have actually been reading prefix part\r
1436                         // we want to store read characters to the prefix part\r
1437                         // and continue reading the characters (proper way\r
1438                         // would be to restart reading the chars, but in that\r
1439                         // case we would have to complicate the token hasher,\r
1440                         // which I do not intend to play with. Instead, we will\r
1441                         // do prefixes when prefixes are due (before adding the\r
1442                         // elements).\r
1443                         m_parsedToken_.m_prefixOffset_\r
1444                                                 = m_parsedToken_.m_charsOffset_;\r
1445                         m_parsedToken_.m_prefixLen_\r
1446                                                 = m_parsedToken_.m_charsLen_;\r
1447                         if (inchars) { // we're doing characters\r
1448                             if (wasinquote == false) {\r
1449                                 m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1450                             }\r
1451                             if (m_parsedToken_.m_charsLen_ != 0) {\r
1452                                 String prefix = m_source_.substring(\r
1453                                        m_current_ - m_parsedToken_.m_charsLen_,\r
1454                                        m_current_);\r
1455                                 m_source_.append(prefix);\r
1456                                 m_extraCurrent_ += m_parsedToken_.m_charsLen_;\r
1457                             }\r
1458                             m_parsedToken_.m_charsLen_ ++;\r
1459                         }\r
1460                         wasinquote = true;\r
1461                         do {\r
1462                             m_current_ ++;\r
1463                             ch = m_source_.charAt(m_current_);\r
1464                             // skip whitespace between '|' and the character\r
1465                         } while (UCharacterProperty.isRuleWhiteSpace(ch));\r
1466                         break;\r
1467                     case 0x0023: // '#' // this is a comment, skip everything through the end of line\r
1468                         do {\r
1469                             m_current_ ++;\r
1470                             ch = m_source_.charAt(m_current_);\r
1471                         } while (!isCharNewLine(ch));\r
1472                         break;\r
1473                     case 0x0021: // '!' // ignoring java set thai reordering\r
1474                         break;\r
1475                     default :\r
1476                         if (newstrength == TOKEN_UNSET_) {\r
1477                             throwParseException(m_rules_, m_current_);\r
1478                         }\r
1479                         if (isSpecialChar(ch) && (inquote == false)) {\r
1480                             throwParseException(m_rules_, m_current_);\r
1481                         }\r
1482                         if (ch == 0x0000 && m_current_ + 1 == limit) {\r
1483                             break;\r
1484                         }\r
1485                         if (inchars) {\r
1486                             if (m_parsedToken_.m_charsLen_ == 0) {\r
1487                                 m_parsedToken_.m_charsOffset_ = m_current_;\r
1488                             }\r
1489                             m_parsedToken_.m_charsLen_++;\r
1490                         }\r
1491                         else {\r
1492                             if (newextensionlen == 0) {\r
1493                                 extensionoffset = m_current_;\r
1494                             }\r
1495                             newextensionlen ++;\r
1496                         }\r
1497                         break;\r
1498                     }\r
1499                 }\r
1500             }\r
1501             if (wasinquote) {\r
1502                 if (ch != 0x27) {\r
1503                     m_source_.append(ch);\r
1504                     m_extraCurrent_ ++;\r
1505                 }\r
1506             }\r
1507             m_current_ ++;\r
1508         }\r
1509         return doEndParseNextToken(newstrength, top,\r
1510                                    extensionoffset, newextensionlen,\r
1511                                    variabletop, before);\r
1512     }\r
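
    // Worked example (editor's sketch): for the rules "&a < b << c <<< d = e", successive
    // calls to parseNextToken() yield tokens whose strengths are TOKEN_RESET_ ("a"),
    // Collator.PRIMARY ("b"), Collator.SECONDARY ("c"), Collator.TERTIARY ("d") and
    // Collator.IDENTICAL ("e"), matching the '&', '<', '<<', '<<<' and '=' cases above.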
1513 \r
1514     /**\r
1515      * Ends parsing of the next token and fills in m_parsedToken_\r
1516      * @param newstrength new strength\r
1517      * @return offset in rules, -1 for end of rules\r
1518      */\r
1519     private int doEndParseNextToken(int newstrength, /*int newcharslen,*/\r
1520                                     boolean top, /*int charsoffset,*/\r
1521                                     int extensionoffset, int newextensionlen,\r
1522                                     boolean variabletop, int before)\r
1523                                     throws ParseException\r
1524     {\r
1525         if (newstrength == TOKEN_UNSET_) {\r
1526             return -1;\r
1527         }\r
1528         if (m_parsedToken_.m_charsLen_ == 0 && top == false) {\r
1529             throwParseException(m_rules_, m_current_);\r
1530         }\r
1531 \r
1532         m_parsedToken_.m_strength_ = newstrength;\r
1533         //m_parsedToken_.m_charsOffset_ = charsoffset;\r
1534         //m_parsedToken_.m_charsLen_ = newcharslen;\r
1535         m_parsedToken_.m_extensionOffset_ = extensionoffset;\r
1536         m_parsedToken_.m_extensionLen_ = newextensionlen;\r
1537         m_parsedToken_.m_flags_ = (char)\r
1538                                   ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)\r
1539                                   | (top ? TOKEN_TOP_MASK_ : 0) | before);\r
1540         return m_current_;\r
1541     }\r
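
    // Editor's note: the flags packed into m_parsedToken_.m_flags_ above (variable top, top and
    // the 'before' strength) are later copied into the reset token by initAReset(), which is how
    // the builder knows that a reset was indirect or used [before n].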
1542 \r
1543     /**\r
1544      * Gets the token that sorts immediately before this element at the given strength\r
1545      * @param sourcetoken\r
1546      * @param strength collation strength\r
1547      * @return the token before source token\r
1548      * @exception ParseException thrown when rules have the wrong syntax\r
1549      */\r
1550     private Token getVirginBefore(Token sourcetoken, int strength)\r
1551                                                           throws ParseException\r
1552     {\r
1553         // this is a virgin before - we need to fish the anchor from the UCA\r
1554         if (sourcetoken != null) {\r
1555             int offset = sourcetoken.m_source_ & 0xFFFFFF;\r
1556             m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));\r
1557         }\r
1558         else {\r
1559             m_UCAColEIter_.setText(\r
1560                              m_source_.substring(m_parsedToken_.m_charsOffset_,\r
1561                              m_parsedToken_.m_charsOffset_ + 1));\r
1562         }\r
1563 \r
1564         int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;\r
1565         int basecontce = m_UCAColEIter_.next();\r
1566         if (basecontce == CollationElementIterator.NULLORDER) {\r
1567             basecontce = 0;\r
1568         }\r
1569 \r
1570         int ch = 0;\r
1571 \r
1572 \r
1573         if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)\r
1574                 && (basece >>> 24 <=  RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */\r
1575 \r
1576             int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;\r
1577             int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);\r
1578             ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);\r
1579             int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);\r
1580             m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;\r
1581             m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;\r
1582 \r
1583             m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1584             m_source_.append('\uFFFE');\r
1585             m_source_.append((char)ch);\r
1586             m_extraCurrent_ += 2;\r
1587             m_parsedToken_.m_charsLen_++;\r
1588 \r
1589             m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)\r
1590             | m_parsedToken_.m_charsOffset_;\r
1591             m_utilToken_.m_rules_ = m_source_;\r
1592             sourcetoken = (Token)m_hashTable_.get(m_utilToken_);\r
1593 \r
1594             if(sourcetoken == null) {\r
1595                 m_listHeader_[m_resultLength_] = new TokenListHeader();\r
1596                 m_listHeader_[m_resultLength_].m_baseCE_\r
1597                     = m_utilCEBuffer_[0] & 0xFFFFFF3F;\r
1598                 if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {\r
1599                     m_listHeader_[m_resultLength_].m_baseContCE_\r
1600                     = m_utilCEBuffer_[1];\r
1601                 }\r
1602                 else {\r
1603                     m_listHeader_[m_resultLength_].m_baseContCE_ = 0;\r
1604                 }\r
1605                 m_listHeader_[m_resultLength_].m_nextCE_ = 0;\r
1606                 m_listHeader_[m_resultLength_].m_nextContCE_ = 0;\r
1607                 m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
1608                 m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
1609                 m_listHeader_[m_resultLength_].m_indirect_ = false;\r
1610 \r
1611                 sourcetoken = new Token();\r
1612                 initAReset(-1, sourcetoken);\r
1613             }\r
1614 \r
1615         } else {\r
1616 \r
1617             // the first CE and second CE are returned in m_utilCEBuffer_\r
1618             /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(\r
1619                                                          basece, basecontce,\r
1620                                                          strength, m_utilCEBuffer_);\r
1621             // we got the previous CE. Now we need to see if the difference between\r
1622             // the two CEs is really of the requested strength.\r
1623             // if it's a bigger difference (we asked for secondary and got primary), we\r
1624             // need to modify the CE.\r
1625             if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {\r
1626                 // adjust the strength\r
1627                 // now we are in the situation where our baseCE should actually be modified in\r
1628                 // order to get the CE in the right position.\r
1629                 if(strength == Collator.SECONDARY) {\r
1630                     m_utilCEBuffer_[0] = basece - 0x0200;\r
1631                 } else { // strength == UCOL_TERTIARY\r
1632                     m_utilCEBuffer_[0] = basece - 0x02;\r
1633                 }\r
1634                 if(RuleBasedCollator.isContinuation(basecontce)) {\r
1635                     if(strength == Collator.SECONDARY) {\r
1636                         m_utilCEBuffer_[1] = basecontce - 0x0200;\r
1637                     } else { // strength == UCOL_TERTIARY\r
1638                         m_utilCEBuffer_[1] = basecontce - 0x02;\r
1639                     }\r
1640                 }\r
1641             }\r
1642 \r
1643 /*\r
1644             // the code below relies on getting a code point from the inverse table, in order to be\r
1645             // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:\r
1646             // 1. There are many code points that have the same CE\r
1647             // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.\r
1648             // Also, in case when there is no equivalent strength before an element, we have to actually\r
1649             // construct one. For example, &[before 2]a << x won't result in x << a, because the element\r
1650             // before a is a primary difference.\r
1651             ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos\r
1652                                                                       + 2];\r
1653             if ((ch &  INVERSE_SIZE_MASK_) != 0) {\r
1654                 int offset = ch & INVERSE_OFFSET_MASK_;\r
1655                 ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[\r
1656                                                                            offset];\r
1657             }\r
1658             m_source_.append((char)ch);\r
1659             m_extraCurrent_ ++;\r
1660             m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;\r
1661             m_parsedToken_.m_charsLen_ = 1;\r
1662 \r
1663             // We got an UCA before. However, this might have been tailored.\r
1664             // example:\r
1665             // &\u30ca = \u306a\r
1666             // &[before 3]\u306a<<<\u306a|\u309d\r
1667 \r
1668             m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)\r
1669                                                  | m_parsedToken_.m_charsOffset_;\r
1670             m_utilToken_.m_rules_ = m_source_;\r
1671             sourcetoken = (Token)m_hashTable_.get(m_utilToken_);\r
1672 */\r
1673 \r
1674             // Here is how it should be: a situation such as &[before 1]a < x should be\r
1675             // resolved exactly as if we wrote &a > x.\r
1676             // Therefore, I don't really care if the UCA value before a has been changed.\r
1677             // However, I do care if the strength between my element and the previous element\r
1678             // is bigger than I wanted. So, if CE < baseCE and I wanted &[before 2], then I'll\r
1679             // have to construct the base CE.\r
1680 \r
1681             // if we found a tailored thing, we have to use the UCA value and\r
1682             // construct a new reset token with constructed name\r
1683             //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {\r
1684                 // character to which we want to anchor is already tailored.\r
1685                 // We need to construct a new token which will be the anchor point\r
1686                 //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');\r
1687                 //m_source_.append(ch);\r
1688                 //m_extraCurrent_ ++;\r
1689                 //m_parsedToken_.m_charsLen_ ++;\r
1690                 // grab before\r
1691                 m_parsedToken_.m_charsOffset_ -= 10;\r
1692                 m_parsedToken_.m_charsLen_ += 10;\r
1693                 m_listHeader_[m_resultLength_] = new TokenListHeader();\r
1694                 m_listHeader_[m_resultLength_].m_baseCE_\r
1695                                                  = m_utilCEBuffer_[0] & 0xFFFFFF3F;\r
1696                 if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {\r
1697                     m_listHeader_[m_resultLength_].m_baseContCE_\r
1698                                                               = m_utilCEBuffer_[1];\r
1699                 }\r
1700                 else {\r
1701                     m_listHeader_[m_resultLength_].m_baseContCE_ = 0;\r
1702                 }\r
1703                 m_listHeader_[m_resultLength_].m_nextCE_ = 0;\r
1704                 m_listHeader_[m_resultLength_].m_nextContCE_ = 0;\r
1705                 m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
1706                 m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
1707                 m_listHeader_[m_resultLength_].m_indirect_ = false;\r
1708                 sourcetoken = new Token();\r
1709                 initAReset(-1, sourcetoken);\r
1710             //}\r
1711         }\r
1712         return sourcetoken;\r
1713     }\r
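
    // Editor's summary: getVirginBefore() resolves "&[before n]X" when X has not been tailored
    // yet: it fetches X's UCA collation elements, asks the inverse UCA table for the previous CE
    // at the requested strength (or derives one for implicit/ideograph CEs), and then builds a
    // fresh TokenListHeader and reset token anchored on that constructed CE.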
1714 \r
1715     /**\r
1716      * Processing Description.\r
1717      * 1. Build a m_listHeader_. Each list has a header, which contains two lists\r
1718      * (positive and negative), a reset token, a baseCE, nextCE, and\r
1719      * previousCE. The lists and reset may be null.\r
1720      * 2. As you process, you keep a LAST pointer that points to the last token\r
1721      * you handled.\r
1722      * @param expand string offset, -1 for null strings\r
1723      * @param targetToken token to update\r
1724      * @return expandnext offset\r
1725      * @throws ParseException thrown when rules syntax failed\r
1726      */\r
1727     private int initAReset(int expand, Token targetToken) throws ParseException\r
1728     {\r
1729         if (m_resultLength_ == m_listHeader_.length - 1) {\r
1730             // grow the header array; tokens hold references to the TokenListHeader\r
1731             // objects themselves, so copying the array of references is safe here\r
1732             TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];\r
1733             System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);\r
1734             m_listHeader_ = temp;\r
1735         }\r
1736         // do the reset thing\r
1737         targetToken.m_rules_ = m_source_;\r
1738         targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24\r
1739                                 | m_parsedToken_.m_charsOffset_;\r
1740         targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24\r
1741                                    | m_parsedToken_.m_extensionOffset_;\r
1742         // keep the flags around so that we know about before\r
1743         targetToken.m_flags_ = m_parsedToken_.m_flags_;\r
1744 \r
1745         if (m_parsedToken_.m_prefixOffset_ != 0) {\r
1746             throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);\r
1747         }\r
1748 \r
1749         targetToken.m_prefix_ = 0;\r
1750         // TODO: this should also handle reverse\r
1751         targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;\r
1752         targetToken.m_strength_ = TOKEN_RESET_;\r
1753         targetToken.m_next_ = null;\r
1754         targetToken.m_previous_ = null;\r
1755         targetToken.m_CELength_ = 0;\r
1756         targetToken.m_expCELength_ = 0;\r
1757         targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];\r
1758         m_listHeader_[m_resultLength_].m_first_ = null;\r
1759         m_listHeader_[m_resultLength_].m_last_ = null;\r
1762         m_listHeader_[m_resultLength_].m_reset_ = targetToken;\r
1763 \r
1764         /* 3 Consider each item: relation, source, and expansion:\r
1765          * e.g. ...< x / y ...\r
1766          * First convert all expansions into normal form. Examples:\r
1767          * If "xy" doesn't occur earlier in the list or in the UCA, convert\r
1768          * &xy * c * d * ... into &x * c/y * d * ...\r
1769          * Note: reset values can never have expansions, although they can\r
1770          * cause the very next item to have one. They may be contractions, if\r
1771          * they are found earlier in the list.\r
1772          */\r
1773         int result = 0;\r
1774         if (expand > 0) {\r
1775             // check to see if there is an expansion\r
1776             if (m_parsedToken_.m_charsLen_ > 1) {\r
1777                 targetToken.m_source_ = ((expand\r
1778                                           - m_parsedToken_.m_charsOffset_ )\r
1779                                           << 24)\r
1780                                           | m_parsedToken_.m_charsOffset_;\r
1781                 result = ((m_parsedToken_.m_charsLen_\r
1782                                + m_parsedToken_.m_charsOffset_ - expand) << 24)\r
1783                                | expand;\r
1784             }\r
1785         }\r
1786 \r
1787         m_resultLength_ ++;\r
1788         m_hashTable_.put(targetToken, targetToken);\r
1789         return result;\r
1790     }\r
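
    // Editor's note: Token.m_source_ and Token.m_expansion_ pack a length in the top 8 bits and
    // an offset into m_source_ in the low 24 bits, e.g. two chars at offset 5 are stored as
    // (2 << 24) | 5 == 0x02000005; getVirginBefore() recovers the offset with "& 0xFFFFFF".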
1791 \r
1792     /**\r
1793      * Checks if a character is special\r
1794      * @param ch character to test\r
1795      * @return true if the character is special\r
1796      */\r
1797     private static final boolean isSpecialChar(char ch)\r
1798     {\r
1799         return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A) // ' '..'/' and ':'..'?'\r
1800                || (ch <= 0x0060 && ch >= 0x005B)                                // '['..'`'\r
1801                || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;               // '}', '~' and '{'\r
1802     }\r
1803 \r
1804     private\r
1805     UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException\r
1806     {\r
1807       while(source.charAt(start) != '[') { /* advance until we find the first '[' */\r
1808         start++;\r
1809       }\r
1810       // now we need to get a balanced set of '[]'. The problem is that a set can contain\r
1811       // nested '[]' pairs, so the first closing ']' does not necessarily end the set\r
1812       int noOpenBraces = 1;\r
1813       int current = 1; // skip the opening brace\r
1814       while(start+current < source.length() && noOpenBraces != 0) {\r
1815         if(source.charAt(start+current) == '[') {\r
1816           noOpenBraces++;\r
1817         } else if(source.charAt(start+current) == ']') { // closing brace\r
1818           noOpenBraces--;\r
1819         }\r
1820         current++;\r
1821       }\r
1822       //int nextBrace = -1;\r
1823 \r
1824       if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {\r
1825         throwParseException(m_rules_, start);\r
1826       }\r
1827       return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);\r
1828     }\r
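
    // Example (editor's sketch): if 'source' contains "[a-z[0-9]]" starting at 'start', the
    // loop above balances the nested brackets and this returns new UnicodeSet("[a-z[0-9]]"),
    // provided another ']' (the one closing the enclosing option) follows in the rules.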
1829 \r
1830 \r
1831     /** In C, optionarg is passed to the function by reference.\r
1832      *  We use a private int to simulate this.\r
1833      */\r
1834     private int m_optionarg_ = 0;\r
1835 \r
1836     private int readOption(String rules, int start, int optionend)\r
1837     {\r
1838         m_optionarg_ = 0;\r
1839         int i = 0;\r
1840         while (i < RULES_OPTIONS_.length) {\r
1841             String option = RULES_OPTIONS_[i].m_name_;\r
1842             int optionlength = option.length();\r
1843             if (rules.length() > start + optionlength\r
1844                 && option.equalsIgnoreCase(rules.substring(start,\r
1845                                                       start + optionlength))) {\r
1846                 if (optionend - start > optionlength) {\r
1847                     m_optionarg_ = start + optionlength;\r
1848                     // start of the options, skip space\r
1849                     while (m_optionarg_ < optionend && UCharacter.isWhitespace(rules.charAt(m_optionarg_)))\r
1850                     {   // eat whitespace\r
1851                         m_optionarg_ ++;\r
1852                     }\r
1853                 }\r
1854                 break;\r
1855             }\r
1856             i ++;\r
1857         }\r
1858         if(i == RULES_OPTIONS_.length) {\r
1859             i = -1;\r
1860         }\r
1861         return i;\r
1862     }\r
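
    // Example (editor's sketch, option name assumed): for rules containing "[strength 3]" with
    // 'start' just past the '[', readOption() would match a RULES_OPTIONS_ entry named
    // "strength" (assuming such an entry exists, as in ICU's rule syntax) and leave
    // m_optionarg_ pointing at the "3".
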
1863     /**\r
1864      * Reads and sets collation options\r
1865      * @return TOKEN_SUCCESS_MASK_ on success, possibly combined with other TOKEN_ masks\r
1866      * @exception ParseException thrown when options in rules are wrong\r
1867      */\r
1868     private byte readAndSetOption() throws ParseException\r
1869     {\r
1870         int start = m_current_ + 1; // skip opening '['\r
1871         int i = readOption(m_rules_, start, m_optionEnd_);\r
1872 \r
1873         int optionarg = m_optionarg_;\r
1874 \r
1875         if (i < 0) {\r
1876             throwParseException(m_rules_, start);\r
1877         }\r
1878 \r
1879         if (i < 7) {\r
1880             if (optionarg != 0) {\r
1881                 for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;\r
1882                                                                         j ++) {\r
1883                      String subname = RULES_OPTIONS_[i].m_subOptions_[j];\r
1884                      int size = optionarg + subname.length();\r
1885                      if (m_rules_.length() > size\r
1886                          && subname.equalsIgnoreCase(m_rules_.substring(\r
1887                                                            optionarg, size))) {\r
1888                          setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,\r
1889                              RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);\r
1890                          return TOKEN_SUCCESS_MASK_;\r
1891                      }\r
1892                 }\r
1893             }\r
1894             throwParseException(m_rules_, optionarg);\r
1895         }\r
1896         else if (i == 7) { // variable top\r
1897             return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;\r
1898         }\r
1899         else if (i == 8) { // rearrange\r
1900             return TOKEN_SUCCESS_MASK_;\r
1901         }\r
1902         else if (i == 9) { // before\r
1903             if (optionarg != 0) {\r
1904                 for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;\r
1905                                                                         j ++) {\r
1906                      String subname = RULES_OPTIONS_[i].m_subOptions_[j];\r
1907                      int size = optionarg + subname.length();\r
1908                      if (m_rules_.length() > size\r
1909                          && subname.equalsIgnoreCase(\r
1910                                                m_rules_.substring(optionarg,\r
1911                                               optionarg + subname.length()))) {\r
1912                          return (byte)(TOKEN_SUCCESS_MASK_\r
1913                             | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]\r
1914                             + 1);\r
1915                      }\r
1916                 }\r
1917             }\r
1918             throwParseException(m_rules_, optionarg);\r
1919         }\r
1920         else if (i == 10) {  // top: we keep an array of limit-CE structures\r
1921             // (INDIRECT_BOUNDARIES_); the index into that array is stored in\r
1922             // m_parsedToken_.m_indirectIndex_\r
1923             m_parsedToken_.m_indirectIndex_ = 0;\r
1924             return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;\r
1925         }\r
1926         else if (i < 13) { // first, last\r
1927             for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {\r
1928                 String subname = RULES_OPTIONS_[i].m_subOptions_[j];\r
1929                 int size = optionarg + subname.length();\r
1930                 if (m_rules_.length() > size\r
1931                     && subname.equalsIgnoreCase(m_rules_.substring(optionarg,\r
1932                                                                    size))) {\r
1933                     m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));\r
1934                     return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;\r
1935                 }\r
1936             }\r
1937             throwParseException(m_rules_, optionarg);\r
1938         }\r
1939         else if(i == 13 || i == 14) { // copy and remove are handled before normalization\r
1940             // we need to move end here\r
1941             int noOpenBraces = 1;\r
1942             m_current_++; // skip opening brace\r
1943             while(m_current_ < m_source_.length() && noOpenBraces != 0) {\r
1944                 if(m_source_.charAt(m_current_) == '[') {\r
1945                   noOpenBraces++;\r
1946                 } else if(m_source_.charAt(m_current_) == ']') { // closing brace\r
1947                   noOpenBraces--;\r
1948                 }\r
1949                 m_current_++;\r
1950             }\r
1951             m_optionEnd_ = m_current_-1;\r
1952             return TOKEN_SUCCESS_MASK_;\r
1953         }\r
1954         else {\r
1955             throwParseException(m_rules_, optionarg);\r
1956         }\r
1957         return TOKEN_SUCCESS_MASK_; // we will never reach here.\r
1958     }\r
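
    // Editor's summary of the return value: TOKEN_SUCCESS_MASK_ is always set on success;
    // TOKEN_TOP_MASK_ is added for [top] and the first/last indirect options,
    // TOKEN_VARIABLE_TOP_MASK_ for the variable-top option, and for [before n] the strength is
    // encoded in the bits covered by TOKEN_BEFORE_ (sub-option value + 1).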
1959 \r
1960     /**\r
1961      * Sets a collation option on the given option set\r
1962      * @param optionset option set to set\r
1963      * @param attribute type to set\r
1964      * @param value attribute value\r
1965      */\r
1966     private void setOptions(OptionSet optionset, int attribute, int value)\r
1967     {\r
1968         switch (attribute) {\r
1969             case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :\r
1970                 optionset.m_isHiragana4_\r
1971                             = (value == RuleBasedCollator.AttributeValue.ON_);\r
1972                 break;\r
1973             case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :\r
1974                 optionset.m_isFrenchCollation_\r
1975                              = (value == RuleBasedCollator.AttributeValue.ON_);\r
1976                 break;\r
1977             case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :\r
1978                 optionset.m_isAlternateHandlingShifted_\r
1979                              = (value\r
1980                                 == RuleBasedCollator.AttributeValue.SHIFTED_);\r
1981                 break;\r
1982             case RuleBasedCollator.Attribute.CASE_FIRST_ :\r
1983                 optionset.m_caseFirst_ = value;\r
1984                 break;\r
1985             case RuleBasedCollator.Attribute.CASE_LEVEL_ :\r
1986                 optionset.m_isCaseLevel_\r
1987                              = (value == RuleBasedCollator.AttributeValue.ON_);\r
1988                 break;\r
1989             case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :\r
1990                 if (value == RuleBasedCollator.AttributeValue.ON_) {\r
1991                     value = Collator.CANONICAL_DECOMPOSITION;\r
1992                 }\r
1993                 optionset.m_decomposition_ = value;\r
1994                 break;\r
1995             case RuleBasedCollator.Attribute.STRENGTH_ :\r
1996                 optionset.m_strength_ = value;\r
1997                 break;\r
1998             default :\r
1999                 break;\r
2000         }\r
2001     }\r
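
    // Example (editor's sketch, option spelling assumed): a rule option such as "[caseFirst upper]"
    // would arrive here as setOptions(m_options_, RuleBasedCollator.Attribute.CASE_FIRST_, <value
    // for upper>), which simply records the value in the working OptionSet for the tailoring.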
2002 \r
2003     UnicodeSet getTailoredSet() throws ParseException\r
2004     {\r
2005         boolean startOfRules = true;\r
2006         UnicodeSet tailored = new UnicodeSet();\r
2007         String pattern;\r
2008         CanonicalIterator it = new CanonicalIterator("");\r
2009 \r
2010         m_parsedToken_.m_strength_ = TOKEN_UNSET_;\r
2011         int sourcelimit = m_source_.length();\r
2012         //int expandNext = 0;\r
2013 \r
2014         while (m_current_ < sourcelimit) {\r
2015             m_parsedToken_.m_prefixOffset_ = 0;\r
2016             if (parseNextToken(startOfRules) < 0) {\r
2017                 // we have reached the end\r
2018                 continue;\r
2019             }\r
2020             startOfRules = false;\r
2021             // The idea is to tokenize the rule set. For each non-reset token,\r
2022             // we add all the canonically equivalent FCD sequences\r
2023             if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {\r
2024                 it.setSource(m_source_.substring(\r
2025                       m_parsedToken_.m_charsOffset_,\r
2026                       m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));\r
2027                 pattern = it.next();\r
2028                 while(pattern != null) {\r
2029                     if(Normalizer.quickCheck(pattern, Normalizer.FCD, 0) != Normalizer.NO) {\r
2030                         tailored.add(pattern);\r
2031                     }\r
2032                     pattern = it.next();\r
2033                 }\r
2034             }\r
2035         }\r
2036         return tailored;\r
2037     }\r
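
    // Illustrative usage (editor's sketch, not part of ICU4J): a freshly constructed parser can
    // report which characters a rule set tailors, e.g.
    //     CollationRuleParser p = new CollationRuleParser("&a < b << c");
    //     UnicodeSet tailored = p.getTailoredSet();  // contains "b" and "c" (plus FCD-equivalent forms)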
2038 \r
2039     final private void extractSetsFromRules(String rules) throws ParseException {\r
2040       int optionNumber = -1;\r
2041       int setStart = 0;\r
2042       int i = 0;\r
2043       while(i < rules.length()) {\r
2044         if(rules.charAt(i) == 0x005B) {\r
2045           optionNumber = readOption(rules, i+1, rules.length());\r
2046           setStart = m_optionarg_;\r
2047           if(optionNumber == 13) { /* copy - parts of UCA to tailoring */\r
2048             UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);\r
2049             if(m_copySet_ == null) {\r
2050               m_copySet_ = newSet;\r
2051             } else {\r
2052               m_copySet_.addAll(newSet);\r
2053             }\r
2054           } else if(optionNumber == 14) {\r
2055             UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);\r
2056             if(m_removeSet_ == null) {\r
2057               m_removeSet_ = newSet;\r
2058             } else {\r
2059               m_removeSet_.addAll(newSet);\r
2060             }\r
2061           }\r
2062         }\r
2063         i++;\r
2064       }\r
2065     }\r
2066 }\r