]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / collate / src / com / ibm / icu / text / CollationRuleParser.java
1 /**\r
2 *******************************************************************************\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and    *\r
4 * others. All Rights Reserved.                                                *\r
5 *******************************************************************************\r
6 */\r
7 package com.ibm.icu.text;\r
8 \r
9 import java.text.ParseException;\r
10 import java.util.Arrays;\r
11 import java.util.Hashtable;\r
12 \r
13 import com.ibm.icu.impl.UCharacterProperty;\r
14 import com.ibm.icu.lang.UCharacter;\r
15 \r
16 /**\r
17 * Class for parsing collation rules, produces a list of tokens that will be\r
18 * turned into collation elements\r
19 * @author Syn Wee Quek\r
20 * @since release 2.2, June 7 2002\r
21 */\r
22 final class CollationRuleParser\r
23 {\r
24     // public data members ---------------------------------------------------\r
25 \r
26     // package private constructors ------------------------------------------\r
27 \r
    /**
     * <p>RuleBasedCollator constructor that takes the rules.
     * Please see RuleBasedCollator class description for more details on the
     * collation rule syntax.</p>
     * @see java.util.Locale
     * @param rules the collation rules to build the collation table from.
     * @exception ParseException thrown when argument rules have an invalid
     *            syntax.
     */
    CollationRuleParser(String rules) throws ParseException
    {
        // Pre-scan the raw rules for UnicodeSet specifications
        // (presumably populates m_copySet_ / m_removeSet_ — implemented
        // elsewhere in this class).
        extractSetsFromRules(rules);
        // Parse the canonically decomposed (NFD) form of the rules;
        // Normalizer.decompose(str, false) is canonical (non-compatibility)
        // decomposition.
        m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
        m_rules_ = m_source_.toString();
        // Current parse position within m_source_.
        m_current_ = 0;
        // Extra (unquoted/expansion) characters get appended past the original
        // rule text; this index tracks where the next one goes.
        m_extraCurrent_ = m_source_.length();
        m_variableTop_ = null;
        m_parsedToken_ = new ParsedToken();
        m_hashTable_ = new Hashtable<Token, Token>();
        // Options start as a snapshot of the UCA collator's settings.
        m_options_ = new OptionSet(RuleBasedCollator.UCA_);
        m_listHeader_ = new TokenListHeader[512];
        m_resultLength_ = 0;
        m_prevStrength_ = TOKEN_UNSET_;
        // call assembleTokenList() manually, so that we can
        // init a parser and manually parse tokens
        //assembleTokenList();
    }
55 \r
56     // package private inner classes -----------------------------------------\r
57 \r
    /**
     * Collation options set.
     * Plain value holder: a snapshot of one collator's attribute settings,
     * taken at construction time and mutated as rule options are parsed.
     */
    static class OptionSet
    {
        // package private constructor ---------------------------------------

        /**
         * Initializes the option set with a snapshot of the argument
         * collator's current attribute values.
         * @param collator collator to copy the initial option values from
         */
        OptionSet(RuleBasedCollator collator)
        {
            m_variableTopValue_ = collator.m_variableTopValue_;
            m_isFrenchCollation_ = collator.isFrenchCollation();
            m_isAlternateHandlingShifted_
                                   = collator.isAlternateHandlingShifted();
            m_caseFirst_ = collator.m_caseFirst_;
            m_isCaseLevel_ = collator.isCaseLevel();
            m_decomposition_ = collator.getDecomposition();
            m_strength_ = collator.getStrength();
            m_isHiragana4_ = collator.m_isHiragana4_;
        }

        // package private data members --------------------------------------

        // value below which elements count as "variable" (shiftable)
        int m_variableTopValue_;
        boolean m_isFrenchCollation_;
        /**
         * Attribute for handling variable elements
         */
        boolean m_isAlternateHandlingShifted_;
        /**
         * who goes first, lower case or uppercase
         */
        int m_caseFirst_;
        /**
         * do we have an extra case level
         */
        boolean m_isCaseLevel_;
        /**
         * attribute for normalization
         */
        int m_decomposition_;
        /**
         * attribute for strength
         */
        int m_strength_;
        /**
         * attribute for special Hiragana
         */
        boolean m_isHiragana4_;
    }
111 \r
    /**
     * List of tokens used by the collation rules.
     * Head of one doubly-linked list of Tokens (the m_next_/m_previous_
     * relinking in assembleTokenList() maintains m_first_/m_last_).
     */
    static class TokenListHeader
    {
        Token m_first_;   // first token of the doubly-linked list
        Token m_last_;    // last token of the doubly-linked list
        // NOTE(review): presumably the token carrying the reset (&) that this
        // list hangs off — confirm against the rest of the parser
        Token m_reset_;
        // true when anchored at an indirect position ([top], [first primary
        // ignorable], ...) — TODO confirm
        boolean m_indirect_;
        // collation elements (and their continuation CEs) bracketing the
        // anchor point — exact semantics defined by the token assembler
        int m_baseCE_;
        int m_baseContCE_;
        int m_nextCE_;
        int m_nextContCE_;
        int m_previousCE_;
        int m_previousContCE_;
        // per-strength bookkeeping arrays, sized by the Collator strength
        // constants (PRIMARY..TERTIARY / IDENTICAL)
        int m_pos_[] = new int[Collator.IDENTICAL + 1];
        int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
        Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
        Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
    }
134 \r
135     /**\r
136      * Token wrapper for collation rules\r
137      */\r
138     static class Token\r
139     {\r
140        // package private data members ---------------------------------------\r
141 \r
142        int m_CE_[];\r
143        int m_CELength_;\r
144        int m_expCE_[];\r
145        int m_expCELength_;\r
146        int m_source_;\r
147        int m_expansion_;\r
148        int m_prefix_;\r
149        int m_strength_;\r
150        int m_toInsert_;\r
151        int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>\r
152        TokenListHeader m_listHeader_;\r
153        Token m_previous_;\r
154        Token m_next_;\r
155        StringBuilder m_rules_;\r
156        char m_flags_;\r
157 \r
158        // package private constructors ---------------------------------------\r
159 \r
160        Token()\r
161        {\r
162            m_CE_ = new int[128];\r
163            m_expCE_ = new int[128];\r
164            // TODO: this should also handle reverse\r
165            m_polarity_ = TOKEN_POLARITY_POSITIVE_;\r
166            m_next_ = null;\r
167            m_previous_ = null;\r
168            m_CELength_ = 0;\r
169            m_expCELength_ = 0;\r
170        }\r
171 \r
172        // package private methods --------------------------------------------\r
173 \r
174        /**\r
175         * Hashcode calculation for token\r
176         * @return the hashcode\r
177         */\r
178        public int hashCode()\r
179        {\r
180            int result = 0;\r
181            int len = (m_source_ & 0xFF000000) >>> 24;\r
182            int inc = ((len - 32) / 32) + 1;\r
183 \r
184            int start = m_source_ & 0x00FFFFFF;\r
185            int limit = start + len;\r
186 \r
187            while (start < limit) {\r
188                result = (result * 37) + m_rules_.charAt(start);\r
189                start += inc;\r
190            }\r
191            return result;\r
192        }\r
193 \r
194        /**\r
195         * Equals calculation\r
196         * @param target object to compare\r
197         * @return true if target is the same as this object\r
198         */\r
199        public boolean equals(Object target)\r
200        {\r
201            if (target == this) {\r
202                return true;\r
203            }\r
204            if (target instanceof Token) {\r
205                Token t = (Token)target;\r
206                int sstart = m_source_ & 0x00FFFFFF;\r
207                int tstart = t.m_source_ & 0x00FFFFFF;\r
208                int slimit = (m_source_ & 0xFF000000) >> 24;\r
209                int tlimit = (m_source_ & 0xFF000000) >> 24;\r
210 \r
211                int end = sstart + slimit - 1;\r
212 \r
213                if (m_source_ == 0 || t.m_source_ == 0) {\r
214                    return false;\r
215                }\r
216                if (slimit != tlimit) {\r
217                    return false;\r
218                }\r
219                if (m_source_ == t.m_source_) {\r
220                    return true;\r
221                }\r
222 \r
223                while (sstart < end\r
224                       && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))\r
225                {\r
226                    ++ sstart;\r
227                    ++ tstart;\r
228                }\r
229                if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {\r
230                    return true;\r
231                }\r
232            }\r
233            return false;\r
234         }\r
235     }\r
236 \r
    // package private data member -------------------------------------------

    /**
     * Strength value marking that a token is a reset, ie & in the rules
     */
    static final int TOKEN_RESET_ = 0xDEADBEEF;

    /**
     * Size of the number of tokens.
     * NOTE(review): appears to count the used entries of m_listHeader_ —
     * confirm in assembleTokenList().
     */
    int m_resultLength_;
    /**
     * List of parsed tokens
     */
    TokenListHeader m_listHeader_[];
    /**
     * Variable top token
     */
    Token m_variableTop_;
    /**
     * Collation options
     */
    OptionSet m_options_;
    /**
     * Normalized collation rules with some extra characters
     */
    StringBuilder m_source_;
    /**
     * Hash table to keep all tokens
     */
    Hashtable<Token, Token> m_hashTable_;
268 \r
269     // package private method ------------------------------------------------\r
270 \r
271     void setDefaultOptionsInCollator(RuleBasedCollator collator)\r
272     {\r
273         collator.m_defaultStrength_ = m_options_.m_strength_;\r
274         collator.m_defaultDecomposition_ = m_options_.m_decomposition_;\r
275         collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;\r
276         collator.m_defaultIsAlternateHandlingShifted_\r
277                                     = m_options_.m_isAlternateHandlingShifted_;\r
278         collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;\r
279         collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;\r
280         collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;\r
281         collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;\r
282     }\r
283 \r
284     // private inner classes -------------------------------------------------\r
285 \r
286     /**\r
287      * This is a token that has been parsed but not yet processed. Used to\r
288      * reduce the number of arguments in the parser\r
289      */\r
290     private static class ParsedToken\r
291     {\r
292         // private constructor ----------------------------------------------\r
293 \r
294         /**\r
295          * Empty constructor\r
296          */\r
297         ParsedToken()\r
298         {\r
299             m_charsLen_ = 0;\r
300             m_charsOffset_ = 0;\r
301             m_extensionLen_ = 0;\r
302             m_extensionOffset_ = 0;\r
303             m_prefixLen_ = 0;\r
304             m_prefixOffset_ = 0;\r
305             m_flags_ = 0;\r
306             m_strength_ = TOKEN_UNSET_;\r
307         }\r
308 \r
309         // private data members ---------------------------------------------\r
310 \r
311         int m_strength_;\r
312         int m_charsOffset_;\r
313         int m_charsLen_;\r
314         int m_extensionOffset_;\r
315         int m_extensionLen_;\r
316         int m_prefixOffset_;\r
317         int m_prefixLen_;\r
318         char m_flags_;\r
319         char m_indirectIndex_;\r
320     }\r
321 \r
322     /**\r
323      * Boundary wrappers\r
324      */\r
325     private static class IndirectBoundaries\r
326     {\r
327         // package private constructor ---------------------------------------\r
328 \r
329         IndirectBoundaries(int startce[], int limitce[])\r
330         {\r
331             // Set values for the top - TODO: once we have values for all the\r
332             // indirects, we are going to initalize here.\r
333             m_startCE_ = startce[0];\r
334             m_startContCE_ = startce[1];\r
335             if (limitce != null) {\r
336                 m_limitCE_ = limitce[0];\r
337                 m_limitContCE_ = limitce[1];\r
338             }\r
339             else {\r
340                 m_limitCE_ = 0;\r
341                 m_limitContCE_ = 0;\r
342             }\r
343         }\r
344 \r
345         // package private data members --------------------------------------\r
346 \r
347         int m_startCE_;\r
348         int m_startContCE_;\r
349         int m_limitCE_;\r
350         int m_limitContCE_;\r
351     }\r
352 \r
    /**
     * Collation option rule tag.
     * Immutable descriptor binding an option name (as written in the rules,
     * e.g. "strength") to a collator attribute and its recognized values.
     */
    private static class TokenOption
    {
        // package private constructor ---------------------------------------

        /**
         * @param name option name as it appears in the rule text
         * @param attribute collator attribute constant this option maps to
         * @param suboptions recognized value spellings, or null when the
         *        option takes no enumerated values
         * @param suboptionattributevalue attribute values parallel to
         *        suboptions, or null
         */
        TokenOption(String name, int attribute, String suboptions[],
                    int suboptionattributevalue[])
        {
            m_name_ = name;
            m_attribute_ = attribute;
            m_subOptions_ = suboptions;
            m_subOptionAttributeValues_ = suboptionattributevalue;
        }

        // package private data member ---------------------------------------

        private String m_name_;
        private int m_attribute_;
        private String m_subOptions_[];
        private int m_subOptionAttributeValues_[];
    }
376 \r
    // private variables -----------------------------------------------------

    /**
     * Current parsed token
     */
    private ParsedToken m_parsedToken_;
    /**
     * Collation rule
     */
    private String m_rules_;
    /**
     * Current parse position within m_source_/m_rules_
     */
    private int m_current_;
    /**
     * End of the option while reading.
     * Need it for UnicodeSet reading support.
     */
    private int m_optionEnd_;
    /*
     * Current offset in m_source
     */
    //private int m_sourceLimit_;
    /**
     * Offset to m_source_ for the extra expansion characters
     */
    private int m_extraCurrent_;

    /**
     * UnicodeSet that contains code points to be copied from the UCA
     */
    UnicodeSet m_copySet_;

    /**
     * UnicodeSet that contains code points for which we want to remove
     * UCA contractions. It implies copying of these code points from
     * the UCA.
     */
    UnicodeSet m_removeSet_;
    /**
     * Stores the previous token's strength when making a list of same level
     * differences.
     */
    private int m_prevStrength_;

    /*
     * This is space for the extra strings that need to be unquoted during the
     * parsing of the rules
     */
    //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
    /**
     * Indicator that the token is not set yet (all bits set, ie -1)
     */
    private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
    /*
     * Indicator that the rule is in the > polarity, ie everything on the
     * right of the rule is less than
     */
    //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
    /**
     * Indicator that the rule is in the < polarity, ie everything on the
     * right of the rule is greater than
     */
    private static final int TOKEN_POLARITY_POSITIVE_ = 1;
    /**
     * Flag mask to determine if top is set
     */
    private static final int TOKEN_TOP_MASK_ = 0x04;
    /**
     * Flag mask to determine if variable top is set
     */
    private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
    /**
     * Flag mask to determine if a before attribute is set
     */
    private static final int TOKEN_BEFORE_ = 0x03;
    /**
     * For use in parsing token options
     */
    private static final int TOKEN_SUCCESS_MASK_ = 0x10;

    /**
     * These values are used for finding CE values for indirect positioning.
     * Indirect positioning is a mechanism for allowing resets on symbolic
     * values. It only works for resets and you cannot tailor indirect names.
     * An indirect name can define either an anchor point or a range. An anchor
     * point behaves in exactly the same way as a code point in reset would,
     * except that it cannot be tailored. A range (we currently only know for
     * the [top] range will explicitly set the upper bound for generated CEs,
     * thus allowing for better control over how many CEs can be squeezed
     * between in the range without performance penalty. In that respect, we use
     * [top] for tailoring of locales that use CJK characters. Other indirect
     * values are currently a pure convenience, they can be used to assure that
     * the CEs will be always positioned in the same place relative to a point
     * with known properties (e.g. first primary ignorable).
     */
    private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];

//    /**
//     * Inverse UCA constants
//     */
//    private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
//    private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
//    private static final int INVERSE_SHIFT_VALUE_ = 20;

    /**
     * Collation option tags
     * [last variable] last variable value
     * [last primary ignorable] largest CE for primary ignorable
     * [last secondary ignorable] largest CE for secondary ignorable
     * [last tertiary ignorable] largest CE for tertiary ignorable
     * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
     */
    private static final TokenOption RULES_OPTIONS_[];
488 \r
    static
    {
        // Fill the indirect-position boundary table. Slot order follows the
        // UCOL_* enumeration named in the comments; most positions are
        // anchors (null limit), only [top]-style slots carry an upper bound.
        INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
        // UCOL_RESET_TOP_VALUE
        INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
                        RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
                        RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
                    null);
        // UCOL_LAST_PRIMARY_IGNORABLE
        INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
                    RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
                    null);

        // UCOL_FIRST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
                   null);
        // UCOL_LAST_SECONDARY_IGNORABLE
        INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
                   null);
        // UCOL_FIRST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
                   null);
        // UCOL_LAST_TERTIARY_IGNORABLE
        INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
                   null);
        // UCOL_FIRST_VARIABLE;
        INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
                   null);
        // UCOL_LAST_VARIABLE
        INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
                   null);
        // UCOL_FIRST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
                   null);
        // UCOL_LAST_NON_VARIABLE
        INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
        // UCOL_FIRST_IMPLICIT
        INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
                   null);
        // UCOL_LAST_IMPLICIT
        INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
        // UCOL_FIRST_TRAILING
        INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
                   null);
        // UCOL_LAST_TRAILING
        INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
                   null);
        // [last trailing] additionally caps its limit at the start of the
        // special primary range (value shifted into the primary byte).
        INDIRECT_BOUNDARIES_[14].m_limitCE_
                 = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;

        // Build the table of recognized rule options. The local arrays
        // "option"/"value" are reused between entries; entries built from
        // the same array instance (e.g. offonoption) deliberately share it.
        RULES_OPTIONS_ = new TokenOption[19];
        String option[] = {"non-ignorable", "shifted"};
        int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
                       RuleBasedCollator.AttributeValue.SHIFTED_};
        RULES_OPTIONS_[0] = new TokenOption("alternate",
                              RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
                              option, value);
        option = new String[1];
        option[0] = "2";
        value = new int[1];
        value[0] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[1] = new TokenOption("backwards",
                                 RuleBasedCollator.Attribute.FRENCH_COLLATION_,
                                 option, value);
        String offonoption[] = new String[2];
        offonoption[0] = "off";
        offonoption[1] = "on";
        int offonvalue[] = new int[2];
        offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
        offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
        RULES_OPTIONS_[2] = new TokenOption("caseLevel",
                                       RuleBasedCollator.Attribute.CASE_LEVEL_,
                                       offonoption, offonvalue);
        option = new String[3];
        option[0] = "lower";
        option[1] = "upper";
        option[2] = "off";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
        value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
        value[2] = RuleBasedCollator.AttributeValue.OFF_;
        RULES_OPTIONS_[3] = new TokenOption("caseFirst",
                                       RuleBasedCollator.Attribute.CASE_FIRST_,
                                       option, value);
        RULES_OPTIONS_[4] = new TokenOption("normalization",
                               RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
                               offonoption, offonvalue);
        RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
                         RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
                         offonoption, offonvalue);
        option = new String[5];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        option[3] = "4";
        option[4] = "I";
        value = new int[5];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
        value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
        RULES_OPTIONS_[6] = new TokenOption("strength",
                                         RuleBasedCollator.Attribute.STRENGTH_,
                                         option, value);
        // Options mapped to LIMIT_ have no direct attribute; they are handled
        // specially by the parser.
        RULES_OPTIONS_[7] = new TokenOption("variable top",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[8] = new TokenOption("rearrange",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        option = new String[3];
        option[0] = "1";
        option[1] = "2";
        option[2] = "3";
        value = new int[3];
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
        RULES_OPTIONS_[9] = new TokenOption("before",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  option, value);
        RULES_OPTIONS_[10] = new TokenOption("top",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        String firstlastoption[] = new String[7];
        firstlastoption[0] = "primary";
        firstlastoption[1] = "secondary";
        firstlastoption[2] = "tertiary";
        firstlastoption[3] = "variable";
        firstlastoption[4] = "regular";
        firstlastoption[5] = "implicit";
        firstlastoption[6] = "trailing";

        int firstlastvalue[] = new int[7];
        Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);

        RULES_OPTIONS_[11] = new TokenOption("first",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  firstlastoption, firstlastvalue);
        RULES_OPTIONS_[12] = new TokenOption("last",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  firstlastoption, firstlastvalue);
        RULES_OPTIONS_[13] = new TokenOption("optimize",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[15] = new TokenOption("undefined",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[17] = new TokenOption("charsetname",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
        RULES_OPTIONS_[18] = new TokenOption("charset",
                                  RuleBasedCollator.Attribute.LIMIT_,
                                  null, null);
    }
668 \r
    /**
     * Utility data members
     */
    // scratch token reused for hash-table lookups in assembleTokenList()
    private Token m_utilToken_ = new Token();
    // reusable CE iterator over the UCA, retargeted as needed
    private CollationElementIterator m_UCAColEIter_
                      = RuleBasedCollator.UCA_.getCollationElementIterator("");
    // scratch pair for a CE and its continuation
    private int m_utilCEBuffer_[] = new int[2];
676 \r
677     // private methods -------------------------------------------------------\r
678 \r
679     /**\r
680      * Assembles the token list\r
681      * @exception ParseException thrown when rules syntax fails\r
682      */\r
683     int assembleTokenList() throws ParseException\r
684     {\r
685         Token lastToken = null;\r
686         m_parsedToken_.m_strength_ = TOKEN_UNSET_;\r
687         int sourcelimit = m_source_.length();\r
688         int expandNext = 0;\r
689 \r
690         while (m_current_ < sourcelimit) {\r
691             m_parsedToken_.m_prefixOffset_ = 0;\r
692             if (parseNextToken(lastToken == null) < 0) {\r
693                 // we have reached the end\r
694                 continue;\r
695             }\r
696             char specs = m_parsedToken_.m_flags_;\r
697             boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);\r
698             boolean top = ((specs & TOKEN_TOP_MASK_) != 0);\r
699             int lastStrength = TOKEN_UNSET_;\r
700             if (lastToken != null) {\r
701                 lastStrength = lastToken.m_strength_;\r
702             }\r
703             m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24\r
704                                              | m_parsedToken_.m_charsOffset_;\r
705             m_utilToken_.m_rules_ = m_source_;\r
706             // 4 Lookup each source in the CharsToToken map, and find a\r
707             // sourcetoken\r
708             Token sourceToken = m_hashTable_.get(m_utilToken_);\r
709             if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {\r
710                 if (lastToken == null) {\r
711                     // this means that rules haven't started properly\r
712                     throwParseException(m_source_.toString(), 0);\r
713                 }\r
714                 //  6 Otherwise (when relation != reset)\r
715                 if (sourceToken == null) {\r
716                     // If sourceToken is null, create new one\r
717                     sourceToken = new Token();\r
718                      sourceToken.m_rules_ = m_source_;\r
719                     sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24\r
720                                            | m_parsedToken_.m_charsOffset_;\r
721                     sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24\r
722                                            | m_parsedToken_.m_prefixOffset_;\r
723                     // TODO: this should also handle reverse\r
724                     sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;\r
725                     sourceToken.m_next_ = null;\r
726                      sourceToken.m_previous_ = null;\r
727                     sourceToken.m_CELength_ = 0;\r
728                     sourceToken.m_expCELength_ = 0;\r
729                     m_hashTable_.put(sourceToken, sourceToken);\r
730                 }\r
731                 else {\r
732                     // we could have fished out a reset here\r
733                     if (sourceToken.m_strength_ != TOKEN_RESET_\r
734                         && lastToken != sourceToken) {\r
735                         // otherwise remove sourceToken from where it was.\r
736                         if (sourceToken.m_next_ != null) {\r
737                             if (sourceToken.m_next_.m_strength_\r
738                                                    > sourceToken.m_strength_) {\r
739                                 sourceToken.m_next_.m_strength_\r
740                                                    = sourceToken.m_strength_;\r
741                             }\r
742                             sourceToken.m_next_.m_previous_\r
743                                                     = sourceToken.m_previous_;\r
744                         }\r
745                         else {\r
746                             sourceToken.m_listHeader_.m_last_\r
747                                                     = sourceToken.m_previous_;\r
748                         }\r
749                         if (sourceToken.m_previous_ != null) {\r
750                             sourceToken.m_previous_.m_next_\r
751                                                         = sourceToken.m_next_;\r
752                         }\r
753                         else {\r
754                             sourceToken.m_listHeader_.m_first_\r
755                                                         = sourceToken.m_next_;\r
756                         }\r
757                         sourceToken.m_next_ = null;\r
758                         sourceToken.m_previous_ = null;\r
759                     }\r
760                 }\r
761                 sourceToken.m_strength_ = m_parsedToken_.m_strength_;\r
762                 sourceToken.m_listHeader_ = lastToken.m_listHeader_;\r
763 \r
764                 // 1.  Find the strongest strength in each list, and set\r
765                 // strongestP and strongestN accordingly in the headers.\r
766                 if (lastStrength == TOKEN_RESET_\r
767                     || sourceToken.m_listHeader_.m_first_ == null) {\r
768                     // If LAST is a reset insert sourceToken in the list.\r
769                     if (sourceToken.m_listHeader_.m_first_ == null) {\r
770                         sourceToken.m_listHeader_.m_first_ = sourceToken;\r
771                         sourceToken.m_listHeader_.m_last_ = sourceToken;\r
772                     }\r
773                     else { // we need to find a place for us\r
774                            // and we'll get in front of the same strength\r
775                         if (sourceToken.m_listHeader_.m_first_.m_strength_\r
776                                                  <= sourceToken.m_strength_) {\r
777                             sourceToken.m_next_\r
778                                           = sourceToken.m_listHeader_.m_first_;\r
779                             sourceToken.m_next_.m_previous_ = sourceToken;\r
780                             sourceToken.m_listHeader_.m_first_ = sourceToken;\r
781                             sourceToken.m_previous_ = null;\r
782                         }\r
783                         else {\r
784                             lastToken = sourceToken.m_listHeader_.m_first_;\r
785                             while (lastToken.m_next_ != null\r
786                                    && lastToken.m_next_.m_strength_\r
787                                                  > sourceToken.m_strength_) {\r
788                                 lastToken = lastToken.m_next_;\r
789                             }\r
790                             if (lastToken.m_next_ != null) {\r
791                                 lastToken.m_next_.m_previous_ = sourceToken;\r
792                             }\r
793                             else {\r
794                                 sourceToken.m_listHeader_.m_last_\r
795                                                                = sourceToken;\r
796                             }\r
797                             sourceToken.m_previous_ = lastToken;\r
798                             sourceToken.m_next_ = lastToken.m_next_;\r
799                             lastToken.m_next_ = sourceToken;\r
800                         }\r
801                     }\r
802                 }\r
803                 else {\r
804                     // Otherwise (when LAST is not a reset)\r
805                     // if polarity (LAST) == polarity(relation), insert\r
806                     // sourceToken after LAST, otherwise insert before.\r
807                     // when inserting after or before, search to the next\r
808                     // position with the same strength in that direction.\r
809                     // (This is called postpone insertion).\r
810                     if (sourceToken != lastToken) {\r
811                         if (lastToken.m_polarity_ == sourceToken.m_polarity_) {\r
812                             while (lastToken.m_next_ != null\r
813                                    && lastToken.m_next_.m_strength_\r
814                                                    > sourceToken.m_strength_) {\r
815                                 lastToken = lastToken.m_next_;\r
816                             }\r
817                             sourceToken.m_previous_ = lastToken;\r
818                             if (lastToken.m_next_ != null) {\r
819                                 lastToken.m_next_.m_previous_ = sourceToken;\r
820                             }\r
821                             else {\r
822                                 sourceToken.m_listHeader_.m_last_ = sourceToken;\r
823                             }\r
824                             sourceToken.m_next_ = lastToken.m_next_;\r
825                             lastToken.m_next_ = sourceToken;\r
826                         }\r
827                         else {\r
828                             while (lastToken.m_previous_ != null\r
829                                    && lastToken.m_previous_.m_strength_\r
830                                                 > sourceToken.m_strength_) {\r
831                                 lastToken = lastToken.m_previous_;\r
832                             }\r
833                             sourceToken.m_next_ = lastToken;\r
834                             if (lastToken.m_previous_ != null) {\r
835                                 lastToken.m_previous_.m_next_ = sourceToken;\r
836                             }\r
837                             else {\r
838                                 sourceToken.m_listHeader_.m_first_\r
839                                                                  = sourceToken;\r
840                             }\r
841                             sourceToken.m_previous_ = lastToken.m_previous_;\r
842                             lastToken.m_previous_ = sourceToken;\r
843                         }\r
844                     }\r
845                     else { // repeated one thing twice in rules, stay with the\r
846                            // stronger strength\r
847                         if (lastStrength < sourceToken.m_strength_) {\r
848                             sourceToken.m_strength_ = lastStrength;\r
849                         }\r
850                     }\r
851                 }\r
852                 // if the token was a variable top, we're gonna put it in\r
853                 if (variableTop == true && m_variableTop_ == null) {\r
854                     variableTop = false;\r
855                     m_variableTop_ = sourceToken;\r
856                 }\r
857                 // Treat the expansions.\r
858                 // There are two types of expansions: explicit (x / y) and\r
859                 // reset based propagating expansions\r
860                 // (&abc * d * e <=> &ab * d / c * e / c)\r
861                 // if both of them are in effect for a token, they are combined.\r
862                sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24\r
863                                           | m_parsedToken_.m_extensionOffset_;\r
864                if (expandNext != 0) {\r
865                    if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {\r
866                        // primary strength kills off the implicit expansion\r
867                        expandNext = 0;\r
868                    }\r
869                    else if (sourceToken.m_expansion_ == 0) {\r
870                        // if there is no expansion, implicit is just added to\r
871                        // the token\r
872                        sourceToken.m_expansion_ = expandNext;\r
873                    }\r
874                    else {\r
875                        // there is both explicit and implicit expansion.\r
876                        // We need to make a combination\r
877                        int start = expandNext & 0xFFFFFF;\r
878                        int size = expandNext >>> 24;\r
879                        if (size > 0) {\r
880                           m_source_.append(m_source_.substring(start,\r
881                                                                start + size));\r
882                        }\r
883                           start = m_parsedToken_.m_extensionOffset_;\r
884                        m_source_.append(m_source_.substring(start,\r
885                                       start + m_parsedToken_.m_extensionLen_));\r
886                        sourceToken.m_expansion_ = (size\r
887                                        + m_parsedToken_.m_extensionLen_) << 24\r
888                                        | m_extraCurrent_;\r
889                        m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;\r
890                    }\r
891                 }\r
892                // if the previous token was a reset before, the strength of this\r
893                // token must match the strength of before. Otherwise we have an\r
894                // undefined situation.\r
895                // In other words, we currently have a cludge which we use to\r
896                // represent &a >> x. This is written as &[before 2]a << x.\r
897                if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {\r
898                    int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;\r
899                    if(beforeStrength != sourceToken.m_strength_) {\r
900                           throwParseException(m_source_.toString(), m_current_);\r
901                    }\r
902                }\r
903 \r
904             }\r
905             else {\r
906                 if (lastToken != null && lastStrength == TOKEN_RESET_) {\r
907                     // if the previous token was also a reset, this means that\r
908                     // we have two consecutive resets and we want to remove the\r
909                     // previous one if empty\r
910                     if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {\r
911                         m_resultLength_ --;\r
912                     }\r
913                 }\r
914                 if (sourceToken == null) {\r
915                     // this is a reset, but it might still be somewhere in the\r
916                     // tailoring, in shorter form\r
917                     int searchCharsLen = m_parsedToken_.m_charsLen_;\r
918                     while (searchCharsLen > 1 && sourceToken == null) {\r
919                         searchCharsLen --;\r
920                         // key = searchCharsLen << 24 | charsOffset;\r
921                         m_utilToken_.m_source_ = searchCharsLen << 24\r
922                                              | m_parsedToken_.m_charsOffset_;\r
923                         m_utilToken_.m_rules_ = m_source_;\r
924                         sourceToken = m_hashTable_.get(m_utilToken_);\r
925                     }\r
926                     if (sourceToken != null) {\r
927                         expandNext = (m_parsedToken_.m_charsLen_\r
928                                                       - searchCharsLen) << 24\r
929                                         | (m_parsedToken_.m_charsOffset_\r
930                                            + searchCharsLen);\r
931                     }\r
932                 }\r
933                 if ((specs & TOKEN_BEFORE_) != 0) {\r
934                     if (top == false) {\r
935                         // we're doing before & there is no indirection\r
936                         int strength = (specs & TOKEN_BEFORE_) - 1;\r
937                         if (sourceToken != null\r
938                             && sourceToken.m_strength_ != TOKEN_RESET_) {\r
939                             // this is a before that is already ordered in the UCA\r
940                             // - so we need to get the previous with good strength\r
941                             while (sourceToken.m_strength_ > strength\r
942                                    && sourceToken.m_previous_ != null) {\r
943                                 sourceToken = sourceToken.m_previous_;\r
944                             }\r
945                             // here, either we hit the strength or NULL\r
946                             if (sourceToken.m_strength_ == strength) {\r
947                                 if (sourceToken.m_previous_ != null) {\r
948                                     sourceToken = sourceToken.m_previous_;\r
949                                 }\r
950                                 else { // start of list\r
951                                     sourceToken\r
952                                          = sourceToken.m_listHeader_.m_reset_;\r
953                                 }\r
954                             }\r
955                             else { // we hit NULL, we should be doing the else part\r
956                                 sourceToken\r
957                                          = sourceToken.m_listHeader_.m_reset_;\r
958                                 sourceToken = getVirginBefore(sourceToken,\r
959                                                               strength);\r
960                             }\r
961                         }\r
962                         else {\r
963                             sourceToken\r
964                                       = getVirginBefore(sourceToken, strength);\r
965                         }\r
966                     }\r
967                     else {\r
968                         // this is both before and indirection\r
969                         top = false;\r
970                         m_listHeader_[m_resultLength_] = new TokenListHeader();\r
971                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
972                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
973                         m_listHeader_[m_resultLength_].m_indirect_ = true;\r
974                         // we need to do slightly more work. we need to get the\r
975                         // baseCE using the inverse UCA & getPrevious. The next\r
976                         // bound is not set, and will be decided in ucol_bld\r
977                         int strength = (specs & TOKEN_BEFORE_) - 1;\r
978                         int baseCE = INDIRECT_BOUNDARIES_[\r
979                                    m_parsedToken_.m_indirectIndex_].m_startCE_;\r
980                         int baseContCE = INDIRECT_BOUNDARIES_[\r
981                                m_parsedToken_.m_indirectIndex_].m_startContCE_;\r
982                         int ce[] = new int[2];\r
983                         if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)\r
984                         && (baseCE >>> 24 <=  RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */\r
985                             int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;\r
986                             int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);\r
987                             int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);\r
988                             ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;\r
989                             ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;\r
990                         } else {\r
991                             CollationParsedRuleBuilder.InverseUCA invuca\r
992                                 = CollationParsedRuleBuilder.INVERSE_UCA_;\r
993                             invuca.getInversePrevCE(baseCE, baseContCE, strength,\r
994                                     ce);\r
995                         }\r
996                         m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];\r
997                         m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];\r
998                         m_listHeader_[m_resultLength_].m_nextCE_ = 0;\r
999                         m_listHeader_[m_resultLength_].m_nextContCE_ = 0;\r
1000 \r
1001                         sourceToken = new Token();\r
1002                         expandNext = initAReset(0, sourceToken);\r
1003                     }\r
1004                 }\r
1005                 // 5 If the relation is a reset:\r
1006                 // If sourceToken is null\r
1007                 // Create new list, create new sourceToken, make the baseCE\r
1008                 // from source, put the sourceToken in ListHeader of the new\r
1009                 // list\r
1010                 if (sourceToken == null) {\r
1011                     if (m_listHeader_[m_resultLength_] == null) {\r
1012                         m_listHeader_[m_resultLength_] = new TokenListHeader();\r
1013                     }\r
1014                     // 3 Consider each item: relation, source, and expansion:\r
1015                     // e.g. ...< x / y ...\r
1016                     // First convert all expansions into normal form.\r
1017                     // Examples:\r
1018                     // If "xy" doesn't occur earlier in the list or in the UCA,\r
1019                     // convert &xy * c * d * ... into &x * c/y * d * ...\r
1020                     // Note: reset values can never have expansions, although\r
1021                     // they can cause the very next item to have one. They may\r
1022                     // be contractions, if they are found earlier in the list.\r
1023                     if (top == false) {\r
1024                         CollationElementIterator coleiter\r
1025                         = RuleBasedCollator.UCA_.getCollationElementIterator(\r
1026                             m_source_.substring(m_parsedToken_.m_charsOffset_,\r
1027                                                 m_parsedToken_.m_charsOffset_\r
1028                                                 + m_parsedToken_.m_charsLen_));\r
1029 \r
1030                         int CE = coleiter.next();\r
1031                         // offset to the character in the full rule string\r
1032                         int expand = coleiter.getOffset()\r
1033                                      + m_parsedToken_.m_charsOffset_;\r
1034                         int SecondCE = coleiter.next();\r
1035 \r
1036                         m_listHeader_[m_resultLength_].m_baseCE_\r
1037                                                              = CE & 0xFFFFFF3F;\r
1038                         if (RuleBasedCollator.isContinuation(SecondCE)) {\r
1039                             m_listHeader_[m_resultLength_].m_baseContCE_\r
1040                                                                     = SecondCE;\r
1041                         }\r
1042                         else {\r
1043                             m_listHeader_[m_resultLength_].m_baseContCE_ = 0;\r
1044                         }\r
1045                         m_listHeader_[m_resultLength_].m_nextCE_ = 0;\r
1046                         m_listHeader_[m_resultLength_].m_nextContCE_ = 0;\r
1047                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
1048                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
1049                         m_listHeader_[m_resultLength_].m_indirect_ = false;\r
1050                         sourceToken = new Token();\r
1051                         expandNext = initAReset(expand, sourceToken);\r
1052                     }\r
1053                     else { // top == TRUE\r
1054                         top = false;\r
1055                         m_listHeader_[m_resultLength_].m_previousCE_ = 0;\r
1056                         m_listHeader_[m_resultLength_].m_previousContCE_ = 0;\r
1057                         m_listHeader_[m_resultLength_].m_indirect_ = true;\r
1058                         IndirectBoundaries ib = INDIRECT_BOUNDARIES_[\r
1059                                               m_parsedToken_.m_indirectIndex_];\r
1060                         m_listHeader_[m_resultLength_].m_baseCE_\r
1061                                                                = ib.m_startCE_;\r
1062                         m_listHeader_[m_resultLength_].m_baseContCE_\r
1063                                                            = ib.m_startContCE_;\r
1064                         m_listHeader_[m_resultLength_].m_nextCE_\r
1065                                                                = ib.m_limitCE_;\r
1066                         m_listHeader_[m_resultLength_].m_nextContCE_\r
1067                                                            = ib.m_limitContCE_;\r
1068                         sourceToken = new Token();\r
1069                         expandNext = initAReset(0, sourceToken);\r
1070                     }\r
1071                 }\r
1072                 else { // reset to something already in rules\r
1073                     top = false;\r
1074                 }\r
1075             }\r
1076             // 7 After all this, set LAST to point to sourceToken, and goto\r
1077             // step 3.\r
1078             lastToken = sourceToken;\r
1079         }\r
1080 \r
1081         if (m_resultLength_ > 0\r
1082             && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {\r
1083             m_resultLength_ --;\r
1084         }\r
1085         return m_resultLength_;\r
1086     }\r
1087 \r
1088     /**\r
1089      * Formats and throws a ParseException\r
1090      * @param rules collation rule that failed\r
1091      * @param offset failed offset in rules\r
1092      * @throws ParseException with failure information\r
1093      */\r
1094     private static final void throwParseException(String rules, int offset)\r
1095                                                           throws ParseException\r
1096     {\r
1097         // for pre-context\r
1098         String precontext = rules.substring(0, offset);\r
1099         String postcontext = rules.substring(offset, rules.length());\r
1100         StringBuilder error = new StringBuilder(\r
1101                                     "Parse error occurred in rule at offset ");\r
1102         error.append(offset);\r
1103         error.append("\n after the prefix \"");\r
1104         error.append(precontext);\r
1105         error.append("\" before the suffix \"");\r
1106         error.append(postcontext);\r
1107         throw new ParseException(error.toString(), offset);\r
1108     }\r
1109 \r
1110     private final boolean doSetTop() {\r
1111         m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1112         m_source_.append((char)0xFFFE);\r
1113         IndirectBoundaries ib =\r
1114                   INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];\r
1115         m_source_.append((char)(ib.m_startCE_ >> 16));\r
1116         m_source_.append((char)(ib.m_startCE_ & 0xFFFF));\r
1117         m_extraCurrent_ += 3;\r
1118         if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_\r
1119                                                        ].m_startContCE_ == 0) {\r
1120             m_parsedToken_.m_charsLen_ = 3;\r
1121         }\r
1122         else {\r
1123             m_source_.append((char)(INDIRECT_BOUNDARIES_[\r
1124                                         m_parsedToken_.m_indirectIndex_\r
1125                                     ].m_startContCE_ >> 16));\r
1126             m_source_.append((char)(INDIRECT_BOUNDARIES_[\r
1127                                         m_parsedToken_.m_indirectIndex_\r
1128                                     ].m_startContCE_ & 0xFFFF));\r
1129             m_extraCurrent_ += 2;\r
1130             m_parsedToken_.m_charsLen_ = 5;\r
1131         }\r
1132         return true;\r
1133     }\r
1134 \r
1135     private static boolean isCharNewLine(char c) {\r
1136         switch (c) {\r
1137         case 0x000A: /* LF */\r
1138         case 0x000D: /* CR */\r
1139         case 0x000C: /* FF */\r
1140         case 0x0085: /* NEL */\r
1141         case 0x2028: /* LS */\r
1142         case 0x2029: /* PS */\r
1143             return true;\r
1144         default:\r
1145             return false;\r
1146         }\r
1147     }\r
1148 \r
1149     /**\r
1150      * Getting the next token\r
1151      *\r
1152      * @param startofrules\r
1153      *            flag indicating if we are at the start of rules\r
1154      * @return the offset of the rules\r
1155      * @exception ParseException\r
1156      *                thrown when rule parsing fails\r
1157      */\r
1158     @SuppressWarnings("fallthrough")\r
1159     private int parseNextToken(boolean startofrules) throws ParseException\r
1160     {\r
1161         // parsing part\r
1162         boolean variabletop = false;\r
1163         boolean top = false;\r
1164         boolean inchars = true;\r
1165         boolean inquote = false;\r
1166         boolean wasinquote = false;\r
1167         byte before = 0;\r
1168         boolean isescaped = false;\r
1169         int /*newcharslen = 0,*/ newextensionlen = 0;\r
1170         int /*charsoffset = 0,*/ extensionoffset = 0;\r
1171         int newstrength = TOKEN_UNSET_;\r
1172 \r
1173         m_parsedToken_.m_charsLen_ = 0;\r
1174         m_parsedToken_.m_charsOffset_ = 0;\r
1175         m_parsedToken_.m_prefixOffset_ = 0;\r
1176         m_parsedToken_.m_prefixLen_ = 0;\r
1177         m_parsedToken_.m_indirectIndex_ = 0;\r
1178 \r
1179         int limit = m_rules_.length();\r
1180         while (m_current_ < limit) {\r
1181             char ch = m_source_.charAt(m_current_);\r
1182             if (inquote) {\r
1183                 if (ch == 0x0027) { // '\''\r
1184                     inquote = false;\r
1185                 }\r
1186                 else {\r
1187                     if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {\r
1188                          if (m_parsedToken_.m_charsLen_ == 0) {\r
1189                              m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1190                          }\r
1191                          m_parsedToken_.m_charsLen_ ++;\r
1192                     }\r
1193                     else {\r
1194                         if (newextensionlen == 0) {\r
1195                             extensionoffset = m_extraCurrent_;\r
1196                         }\r
1197                         newextensionlen ++;\r
1198                     }\r
1199                 }\r
1200             }\r
1201             else if (isescaped) {\r
1202                 isescaped = false;\r
1203                 if (newstrength == TOKEN_UNSET_) {\r
1204                     throwParseException(m_rules_, m_current_);\r
1205                 }\r
1206                 if (ch != 0 && m_current_ != limit) {\r
1207                     if (inchars) {\r
1208                         if (m_parsedToken_.m_charsLen_ == 0) {\r
1209                             m_parsedToken_.m_charsOffset_ = m_current_;\r
1210                         }\r
1211                         m_parsedToken_.m_charsLen_ ++;\r
1212                     }\r
1213                     else {\r
1214                         if (newextensionlen == 0) {\r
1215                             extensionoffset = m_current_;\r
1216                         }\r
1217                         newextensionlen ++;\r
1218                     }\r
1219                 }\r
1220             }\r
1221             else {\r
1222                 if (!UCharacterProperty.isRuleWhiteSpace(ch)) {\r
1223                     // Sets the strength for this entry\r
1224                     switch (ch) {\r
1225                     case 0x003D : // '='\r
1226                         if (newstrength != TOKEN_UNSET_) {\r
1227                             return doEndParseNextToken(newstrength,\r
1228                                                        top,\r
1229                                                        extensionoffset,\r
1230                                                        newextensionlen,\r
1231                                                        variabletop, before);\r
1232                         }\r
1233                         // if we start with strength, we'll reset to top\r
1234                         if (startofrules == true) {\r
1235                             m_parsedToken_.m_indirectIndex_ = 5;\r
1236                             top = doSetTop();\r
1237                             return doEndParseNextToken(TOKEN_RESET_,\r
1238                                                        top,\r
1239                                                        extensionoffset,\r
1240                                                        newextensionlen,\r
1241                                                        variabletop, before);\r
1242                         }\r
1243                         newstrength = Collator.IDENTICAL;\r
1244                         if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'\r
1245                             m_current_++;\r
1246                             m_prevStrength_ = newstrength;\r
1247                         }else{\r
1248                             m_prevStrength_ = TOKEN_UNSET_;\r
1249                         }\r
1250                         break;\r
1251                     case 0x002C : // ','\r
1252                         if (newstrength != TOKEN_UNSET_) {\r
1253                             return doEndParseNextToken(newstrength,\r
1254                                                        top,\r
1255                                                        extensionoffset,\r
1256                                                        newextensionlen,\r
1257                                                        variabletop, before);\r
1258                         }\r
1259                         // if we start with strength, we'll reset to top\r
1260                         if (startofrules == true) {\r
1261                             m_parsedToken_.m_indirectIndex_ = 5;\r
1262                             top = doSetTop();\r
1263                             return doEndParseNextToken(TOKEN_RESET_,\r
1264                                                        top,\r
1265                                                        extensionoffset,\r
1266                                                        newextensionlen,\r
1267                                                        variabletop, before);\r
1268                         }\r
1269                         newstrength = Collator.TERTIARY;\r
1270                         m_prevStrength_ = TOKEN_UNSET_;\r
1271                         break;\r
1272                     case 0x003B : // ';'\r
1273                         if (newstrength != TOKEN_UNSET_) {\r
1274                             return doEndParseNextToken(newstrength,\r
1275                                                        top,\r
1276                                                        extensionoffset,\r
1277                                                        newextensionlen,\r
1278                                                        variabletop, before);\r
1279                         }\r
1280                         // if we start with strength, we'll reset to top\r
1281                         if (startofrules == true) {\r
1282                             m_parsedToken_.m_indirectIndex_ = 5;\r
1283                             top = doSetTop();\r
1284                             return doEndParseNextToken(TOKEN_RESET_,\r
1285                                                        top,\r
1286                                                        extensionoffset,\r
1287                                                        newextensionlen,\r
1288                                                        variabletop, before);\r
1289                         }\r
1290                         newstrength = Collator.SECONDARY;\r
1291                         m_prevStrength_ = TOKEN_UNSET_;\r
1292                         break;\r
1293                     case 0x003C : // '<'\r
1294                         if (newstrength != TOKEN_UNSET_) {\r
1295                             return doEndParseNextToken(newstrength,\r
1296                                                        top,\r
1297                                                        extensionoffset,\r
1298                                                        newextensionlen,\r
1299                                                        variabletop, before);\r
1300                         }\r
1301                         // if we start with strength, we'll reset to top\r
1302                         if (startofrules == true) {\r
1303                             m_parsedToken_.m_indirectIndex_ = 5;\r
1304                             top = doSetTop();\r
1305                             return doEndParseNextToken(TOKEN_RESET_,\r
1306                                                        top,\r
1307                                                        extensionoffset,\r
1308                                                        newextensionlen,\r
1309                                                        variabletop, before);\r
1310                         }\r
1311                         // before this, do a scan to verify whether this is\r
1312                         // another strength\r
1313                         if (m_source_.charAt(m_current_ + 1) == 0x003C) {\r
1314                             m_current_ ++;\r
1315                             if (m_source_.charAt(m_current_ + 1) == 0x003C) {\r
1316                                 m_current_ ++; // three in a row!\r
1317                                 newstrength = Collator.TERTIARY;\r
1318                             }\r
1319                             else { // two in a row\r
1320                                 newstrength = Collator.SECONDARY;\r
1321                             }\r
1322                         }\r
1323                         else { // just one\r
1324                             newstrength = Collator.PRIMARY;\r
1325                         }\r
1326 \r
1327                         if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'\r
1328                             m_current_++;\r
1329                             m_prevStrength_ = newstrength;\r
1330                         }else{\r
1331                             m_prevStrength_ = TOKEN_UNSET_;\r
1332                         }\r
1333                         break;\r
1334                     case 0x0026 : // '&'\r
1335                         if (newstrength != TOKEN_UNSET_) {\r
1336                             return doEndParseNextToken(newstrength,\r
1337                                                        top,\r
1338                                                        extensionoffset,\r
1339                                                        newextensionlen,\r
1340                                                        variabletop, before);\r
1341                         }\r
1342                         newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0\r
1343                         m_prevStrength_ = TOKEN_UNSET_;\r
1344                         break;\r
1345                     case 0x005b : // '['\r
1346                         // options - read an option, analyze it\r
1347                         m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);\r
1348                         if (m_optionEnd_ != -1) { // ']'\r
1349                             byte result = readAndSetOption();\r
1350                             m_current_ = m_optionEnd_;\r
1351                             if ((result & TOKEN_TOP_MASK_) != 0) {\r
1352                                 if (newstrength == TOKEN_RESET_) {\r
1353                                     top = doSetTop();\r
1354                                     if (before != 0) {\r
1355                                         // This is a combination of before and\r
1356                                         // indirection like\r
1357                                         // '&[before 2][first regular]<b'\r
1358                                         m_source_.append((char)0x002d);\r
1359                                         m_source_.append((char)before);\r
1360                                         m_extraCurrent_ += 2;\r
1361                                         m_parsedToken_.m_charsLen_ += 2;\r
1362                                     }\r
1363                                     m_current_ ++;\r
1364                                     return doEndParseNextToken(newstrength,\r
1365                                                        true,\r
1366                                                        extensionoffset,\r
1367                                                        newextensionlen,\r
1368                                                        variabletop, before);\r
1369                                 }\r
1370                                 else {\r
1371                                     throwParseException(m_rules_, m_current_);\r
1372                                 }\r
1373                             }\r
1374                             else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {\r
1375                                 if (newstrength != TOKEN_RESET_\r
1376                                     && newstrength != TOKEN_UNSET_) {\r
1377                                     variabletop = true;\r
1378                                     m_parsedToken_.m_charsOffset_\r
1379                                                              = m_extraCurrent_;\r
1380                                     m_source_.append((char)0xFFFF);\r
1381                                     m_extraCurrent_ ++;\r
1382                                     m_current_ ++;\r
1383                                     m_parsedToken_.m_charsLen_ = 1;\r
1384                                     return doEndParseNextToken(newstrength,\r
1385                                                        top,\r
1386                                                        extensionoffset,\r
1387                                                        newextensionlen,\r
1388                                                        variabletop, before);\r
1389                                 }\r
1390                                 else {\r
1391                                     throwParseException(m_rules_, m_current_);\r
1392                                 }\r
1393                             }\r
1394                             else if ((result & TOKEN_BEFORE_) != 0){\r
1395                                 if (newstrength == TOKEN_RESET_) {\r
1396                                     before = (byte)(result & TOKEN_BEFORE_);\r
1397                                 }\r
1398                                 else {\r
1399                                     throwParseException(m_rules_, m_current_);\r
1400                                 }\r
1401                             }\r
1402                         }\r
1403                         break;\r
1404                     case 0x002F : // '/'\r
1405                         wasinquote = false; // if we were copying source\r
1406                                             // characters, we want to stop now\r
1407                         inchars = false; // we're now processing expansion\r
1408                         break;\r
1409                     case 0x005C : // back slash for escaped chars\r
1410                         isescaped = true;\r
1411                         break;\r
1412                     // found a quote, we're gonna start copying\r
1413                     case 0x0027 : //'\''\r
1414                         if (newstrength == TOKEN_UNSET_) {\r
1415                             if (m_prevStrength_ == TOKEN_UNSET_) {\r
1416                                 // quote is illegal until we have a strength\r
1417                                 throwParseException(m_rules_, m_current_);\r
1418                             }else{\r
1419                                 newstrength = m_prevStrength_;\r
1420                             }\r
1421                         }\r
1422                         inquote = true;\r
1423                         if (inchars) { // we're doing characters\r
1424                             if (wasinquote == false) {\r
1425                                 m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1426                             }\r
1427                             if (m_parsedToken_.m_charsLen_ != 0) {\r
1428                                 m_source_.append(m_source_.substring(\r
1429                                        m_current_ - m_parsedToken_.m_charsLen_,\r
1430                                        m_current_));\r
1431                                 m_extraCurrent_ += m_parsedToken_.m_charsLen_;\r
1432                             }\r
1433                             m_parsedToken_.m_charsLen_ ++;\r
1434                         }\r
1435                         else { // we're doing an expansion\r
1436                             if (wasinquote == false) {\r
1437                                 extensionoffset = m_extraCurrent_;\r
1438                             }\r
1439                             if (newextensionlen != 0) {\r
1440                                 m_source_.append(m_source_.substring(\r
1441                                                    m_current_ - newextensionlen,\r
1442                                                    m_current_));\r
1443                                 m_extraCurrent_ += newextensionlen;\r
1444                             }\r
1445                             newextensionlen ++;\r
1446                         }\r
1447                         wasinquote = true;\r
1448                         m_current_ ++;\r
1449                         ch = m_source_.charAt(m_current_);\r
1450                         if (ch == 0x0027) { // copy the double quote\r
1451                             m_source_.append(ch);\r
1452                             m_extraCurrent_ ++;\r
1453                             inquote = false;\r
1454                         }\r
1455                         break;\r
1456                     // '@' is french only if the strength is not currently set\r
1457                     // if it is, it's just a regular character in collation\r
1458                     case 0x0040 : // '@'\r
1459                         if (newstrength == TOKEN_UNSET_) {\r
1460                             m_options_.m_isFrenchCollation_ = true;\r
1461                             break;\r
1462                         }\r
1463                         // fall through\r
1464                     case 0x007C : //|\r
1465                         // this means we have actually been reading prefix part\r
1466                         // we want to store read characters to the prefix part\r
1467                         // and continue reading the characters (proper way\r
1468                         // would be to restart reading the chars, but in that\r
1469                         // case we would have to complicate the token hasher,\r
1470                         // which I do not intend to play with. Instead, we will\r
1471                         // do prefixes when prefixes are due (before adding the\r
1472                         // elements).\r
1473                         m_parsedToken_.m_prefixOffset_\r
1474                                                 = m_parsedToken_.m_charsOffset_;\r
1475                         m_parsedToken_.m_prefixLen_\r
1476                                                 = m_parsedToken_.m_charsLen_;\r
1477                         if (inchars) { // we're doing characters\r
1478                             if (wasinquote == false) {\r
1479                                 m_parsedToken_.m_charsOffset_ = m_extraCurrent_;\r
1480                             }\r
1481                             if (m_parsedToken_.m_charsLen_ != 0) {\r
1482                                 String prefix = m_source_.substring(\r
1483                                        m_current_ - m_parsedToken_.m_charsLen_,\r
1484                                        m_current_);\r
1485                                 m_source_.append(prefix);\r
1486                                 m_extraCurrent_ += m_parsedToken_.m_charsLen_;\r
1487                             }\r
1488                             m_parsedToken_.m_charsLen_ ++;\r
1489                         }\r
1490                         wasinquote = true;\r
1491                         do {\r
1492                             m_current_ ++;\r
1493                             ch = m_source_.charAt(m_current_);\r
1494                             // skip whitespace between '|' and the character\r
1495                         } while (UCharacterProperty.isRuleWhiteSpace(ch));\r
1496                         break;\r
1497                     case 0x0023: // '#' // this is a comment, skip everything through the end of line\r
1498                         do {\r
1499                             m_current_ ++;\r
1500                             ch = m_source_.charAt(m_current_);\r
1501                         } while (!isCharNewLine(ch));\r
1502                         break;\r
1503                     case 0x0021: // '!' // ignoring java set thai reordering\r
1504                         break;\r
1505                     default :\r
1506                         if (newstrength == TOKEN_UNSET_) {\r
1507                             if(m_prevStrength_ == TOKEN_UNSET_){\r
1508                                 throwParseException(m_rules_, m_current_);\r
1509                             }else{\r
1510                                 newstrength = m_prevStrength_;\r
1511                             }\r
1512                         }\r
1513                         if (isSpecialChar(ch) && (inquote == false)) {\r
1514                             throwParseException(m_rules_, m_current_);\r
1515                         }\r
1516                         if (ch == 0x0000 && m_current_ + 1 == limit) {\r
1517                             break;\r
1518                         }\r
1519                         if (inchars) {\r
1520                             if (m_parsedToken_.m_charsLen_ == 0) {\r
1521                                 m_parsedToken_.m_charsOffset_ = m_current_;\r
1522                             }\r
1523                             m_parsedToken_.m_charsLen_++;\r
1524                             if(m_prevStrength_ != TOKEN_UNSET_){\r
1525                                 char[] fullchar = Character.toChars(Character.codePointAt(m_source_, m_current_));\r
1526                                 m_current_ += fullchar.length;\r
1527                                 m_parsedToken_.m_charsLen_ += fullchar.length - 1;\r
1528                                 return doEndParseNextToken(newstrength,\r
1529                                                            top,\r
1530                                                            extensionoffset,\r
1531                                                            newextensionlen,\r
1532                                                            variabletop, before);\r
1533                             }\r
1534                         }\r
1535                         else {\r
1536                             if (newextensionlen == 0) {\r
1537                                 extensionoffset = m_current_;\r
1538                             }\r
1539                             newextensionlen ++;\r
1540                         }\r
1541                         break;\r
1542                     }\r
1543                 }\r
1544             }\r
1545             if (wasinquote) {\r
1546                 if (ch != 0x27) {\r
1547                       m_source_.append(ch);\r
1548                     m_extraCurrent_ ++;\r
1549                 }\r
1550             }\r
1551             m_current_ ++;\r
1552         }\r
1553         return doEndParseNextToken(newstrength, top,\r
1554                                    extensionoffset, newextensionlen,\r
1555                                    variabletop, before);\r
1556     }\r
1557 \r
1558     /**\r
1559      * End the next parse token\r
1560      * @param newstrength new strength\r
1561      * @return offset in rules, -1 for end of rules\r
1562      */\r
1563     private int doEndParseNextToken(int newstrength, /*int newcharslen,*/\r
1564                                     boolean top, /*int charsoffset,*/\r
1565                                     int extensionoffset, int newextensionlen,\r
1566                                     boolean variabletop, int before)\r
1567                                     throws ParseException\r
1568     {\r
1569         if (newstrength == TOKEN_UNSET_) {\r
1570             return -1;\r
1571         }\r
1572         if (m_parsedToken_.m_charsLen_ == 0 && top == false) {\r
1573             throwParseException(m_rules_, m_current_);\r
1574         }\r
1575 \r
1576         m_parsedToken_.m_strength_ = newstrength;\r
1577         //m_parsedToken_.m_charsOffset_ = charsoffset;\r
1578         //m_parsedToken_.m_charsLen_ = newcharslen;\r
1579         m_parsedToken_.m_extensionOffset_ = extensionoffset;\r
1580         m_parsedToken_.m_extensionLen_ = newextensionlen;\r
1581         m_parsedToken_.m_flags_ = (char)\r
1582                                   ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)\r
1583                                   | (top ? TOKEN_TOP_MASK_ : 0) | before);\r
1584         return m_current_;\r
1585     }\r
1586 \r
    /**
     * Token before this element.
     * <p>Fishes the collation element that sorts immediately before the
     * anchor character out of the UCA at the requested strength and returns a
     * reset token for it, constructing and registering a new token list when
     * no suitable token already exists.</p>
     * @param sourcetoken already-tailored token to anchor before, or null to
     *        anchor before the character currently in m_parsedToken_
     * @param strength collation strength
     * @return the token before source token
     * @exception ParseException thrown when rules have the wrong syntax
     */
    private Token getVirginBefore(Token sourcetoken, int strength)
                                                          throws ParseException
    {
        // this is a virgin before - we need to fish the anchor from the UCA
        if (sourcetoken != null) {
            // low 24 bits of m_source_ hold the offset of the token's chars
            int offset = sourcetoken.m_source_ & 0xFFFFFF;
            m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
        }
        else {
            m_UCAColEIter_.setText(
                             m_source_.substring(m_parsedToken_.m_charsOffset_,
                             m_parsedToken_.m_charsOffset_ + 1));
        }

        // first two CEs of the anchor; 0xFFFFFF3F presumably masks off the
        // continuation/flag bits of the tertiary byte -- TODO confirm against
        // the RuleBasedCollator CE masks
        int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
        int basecontce = m_UCAColEIter_.next();
        if (basecontce == CollationElementIterator.NULLORDER) {
            basecontce = 0;
        }

        int ch = 0;


        if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
                && (basece >>> 24 <=  RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */

            // Implicit CE: rebuild the full implicit primary from the CE
            // pair, step one code point back in raw (implicit) space, and
            // synthesize the CE pair for that previous code point.
            int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
            int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
            ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
            int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
            // 0x0505 looks like the common secondary/tertiary weights for a
            // synthesized CE -- TODO confirm
            m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
            m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;

            // record a synthetic two-char name (U+FFFE + code point) for the
            // constructed anchor in the scratch tail of m_source_
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
            m_source_.append('\uFFFE');
            m_source_.append((char)ch);
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_++;

            // probe the hash table for an existing token with this name
            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
            | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = m_hashTable_.get(m_utilToken_);

            if(sourcetoken == null) {
                // no existing token: open a new token list anchored on the
                // constructed CE pair and create its reset token
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                    = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                    = m_utilCEBuffer_[1];
                }
                else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;

                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            }

        } else {

            // first ce and second ce m_utilCEBuffer_
            /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
                                                         basece, basecontce,
                                                         strength, m_utilCEBuffer_);
            // we got the previous CE. Now we need to see if the difference between
            // the two CEs is really of the requested strength.
            // if it's a bigger difference (we asked for secondary and got primary), we
            // need to modify the CE.
            if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
                // adjust the strength
                // now we are in the situation where our baseCE should actually be modified in
                // order to get the CE in the right position.
                // 0x0200 / 0x02 subtract one from the secondary / tertiary
                // weight byte respectively -- TODO confirm byte layout
                if(strength == Collator.SECONDARY) {
                    m_utilCEBuffer_[0] = basece - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    m_utilCEBuffer_[0] = basece - 0x02;
                }
                if(RuleBasedCollator.isContinuation(basecontce)) {
                    if(strength == Collator.SECONDARY) {
                        m_utilCEBuffer_[1] = basecontce - 0x0200;
                    } else { // strength == UCOL_TERTIARY
                        m_utilCEBuffer_[1] = basecontce - 0x02;
                    }
                }
            }

/*
            // the code below relies on getting a code point from the inverse table, in order to be
            // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
            // 1. There are many code points that have the same CE
            // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
            // Also, in case when there is no equivalent strength before an element, we have to actually
            // construct one. For example, &[before 2]a << x won't result in x << a, because the element
            // before a is a primary difference.
            ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
                                                                      + 2];
            if ((ch &  INVERSE_SIZE_MASK_) != 0) {
                int offset = ch & INVERSE_OFFSET_MASK_;
                ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
                                                                           offset];
            }
            m_source_.append((char)ch);
            m_extraCurrent_ ++;
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
            m_parsedToken_.m_charsLen_ = 1;

            // We got an UCA before. However, this might have been tailored.
            // example:
            // &\u30ca = \u306a
            // &[before 3]\u306a<<<\u306a|\u309d

            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                                 | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token)m_hashTable_.get(m_utilToken_);
*/

            // here is how it should be. The situation such as &[before 1]a < x, should be
            // resolved exactly as if we wrote &a > x.
            // therefore, I don't really care if the UCA value before a has been changed.
            // However, I do care if the strength between my element and the previous element
            // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
            // have to construct the base CE.

            // if we found a tailored thing, we have to use the UCA value and
            // construct a new reset token with constructed name
            //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
                // character to which we want to anchor is already tailored.
                // We need to construct a new token which will be the anchor point
                //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
                //m_source_.append(ch);
                //m_extraCurrent_ ++;
                //m_parsedToken_.m_charsLen_ ++;
                // grab before
                // NOTE(review): widens the token name by a magic 10 chars to
                // take in text appended earlier during "[before n]" handling
                // -- TODO confirm the constant against the caller
                m_parsedToken_.m_charsOffset_ -= 10;
                m_parsedToken_.m_charsLen_ += 10;
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                                                 = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                                                              = m_utilCEBuffer_[1];
                }
                else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;
                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            //}
        }
        return sourcetoken;
    }
1758 \r
1759     /**\r
1760      * Processing Description.\r
1761      * 1. Build a m_listHeader_. Each list has a header, which contains two lists\r
1762      * (positive and negative), a reset token, a baseCE, nextCE, and\r
1763      * previousCE. The lists and reset may be null.\r
1764      * 2. As you process, you keep a LAST pointer that points to the last token\r
1765      * you handled.\r
1766      * @param expand string offset, -1 for null strings\r
1767      * @param targetToken token to update\r
1768      * @return expandnext offset\r
1769      * @throws ParseException thrown when rules syntax failed\r
1770      */\r
1771     private int initAReset(int expand, Token targetToken) throws ParseException\r
1772     {\r
1773         if (m_resultLength_ == m_listHeader_.length - 1) {\r
1774             // Unfortunately, this won't work, as we store addresses of lhs in\r
1775             // token\r
1776             TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];\r
1777             System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);\r
1778             m_listHeader_ = temp;\r
1779         }\r
1780         // do the reset thing\r
1781         targetToken.m_rules_ = m_source_;\r
1782         targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24\r
1783                                 | m_parsedToken_.m_charsOffset_;\r
1784         targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24\r
1785                                    | m_parsedToken_.m_extensionOffset_;\r
1786         // keep the flags around so that we know about before\r
1787         targetToken.m_flags_ = m_parsedToken_.m_flags_;\r
1788 \r
1789         if (m_parsedToken_.m_prefixOffset_ != 0) {\r
1790             throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);\r
1791         }\r
1792 \r
1793         targetToken.m_prefix_ = 0;\r
1794         // TODO: this should also handle reverse\r
1795         targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;\r
1796         targetToken.m_strength_ = TOKEN_RESET_;\r
1797         targetToken.m_next_ = null;\r
1798         targetToken.m_previous_ = null;\r
1799         targetToken.m_CELength_ = 0;\r
1800         targetToken.m_expCELength_ = 0;\r
1801         targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];\r
1802         m_listHeader_[m_resultLength_].m_first_ = null;\r
1803         m_listHeader_[m_resultLength_].m_last_ = null;\r
1804         m_listHeader_[m_resultLength_].m_first_ = null;\r
1805         m_listHeader_[m_resultLength_].m_last_ = null;\r
1806         m_listHeader_[m_resultLength_].m_reset_ = targetToken;\r
1807 \r
1808         /* 3 Consider each item: relation, source, and expansion:\r
1809          * e.g. ...< x / y ...\r
1810          * First convert all expansions into normal form. Examples:\r
1811          * If "xy" doesn't occur earlier in the list or in the UCA, convert\r
1812          * &xy * c * d * ... into &x * c/y * d * ...\r
1813          * Note: reset values can never have expansions, although they can\r
1814          * cause the very next item to have one. They may be contractions, if\r
1815          * they are found earlier in the list.\r
1816          */\r
1817         int result = 0;\r
1818         if (expand > 0) {\r
1819             // check to see if there is an expansion\r
1820             if (m_parsedToken_.m_charsLen_ > 1) {\r
1821                 targetToken.m_source_ = ((expand\r
1822                                           - m_parsedToken_.m_charsOffset_ )\r
1823                                           << 24)\r
1824                                           | m_parsedToken_.m_charsOffset_;\r
1825                 result = ((m_parsedToken_.m_charsLen_\r
1826                                + m_parsedToken_.m_charsOffset_ - expand) << 24)\r
1827                                | expand;\r
1828             }\r
1829         }\r
1830 \r
1831         m_resultLength_ ++;\r
1832         m_hashTable_.put(targetToken, targetToken);\r
1833         return result;\r
1834     }\r
1835 \r
1836     /**\r
1837      * Checks if an character is special\r
1838      * @param ch character to test\r
1839      * @return true if the character is special\r
1840      */\r
1841     private static final boolean isSpecialChar(char ch)\r
1842     {\r
1843         return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A)\r
1844                || (ch <= 0x0060 && ch >= 0x005B)\r
1845                || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;\r
1846     }\r
1847 \r
1848     private\r
1849     UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException\r
1850     {\r
1851       while(source.charAt(start) != '[') { /* advance while we find the first '[' */\r
1852         start++;\r
1853       }\r
1854       // now we need to get a balanced set of '[]'. The problem is that a set can have\r
1855       // many, and *end point to the first closing '['\r
1856       int noOpenBraces = 1;\r
1857       int current = 1; // skip the opening brace\r
1858       while(start+current < source.length() && noOpenBraces != 0) {\r
1859         if(source.charAt(start+current) == '[') {\r
1860           noOpenBraces++;\r
1861         } else if(source.charAt(start+current) == ']') { // closing brace\r
1862           noOpenBraces--;\r
1863         }\r
1864         current++;\r
1865       }\r
1866       //int nextBrace = -1;\r
1867 \r
1868       if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {\r
1869         throwParseException(m_rules_, start);\r
1870       }\r
1871       return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);\r
1872     }\r
1873 \r
1874 \r
    /** in C, optionarg is passed by reference to function.
     *  We use a private int to simulate this: after readOption() returns,
     *  it holds the index in the rules of the option's argument (past any
     *  leading whitespace), or 0 when the option has no argument.
     */
    private int m_optionarg_ = 0;
1879 \r
1880     private int readOption(String rules, int start, int optionend)\r
1881     {\r
1882         m_optionarg_ = 0;\r
1883         int i = 0;\r
1884         while (i < RULES_OPTIONS_.length) {\r
1885             String option = RULES_OPTIONS_[i].m_name_;\r
1886             int optionlength = option.length();\r
1887             if (rules.length() > start + optionlength\r
1888                 && option.equalsIgnoreCase(rules.substring(start,\r
1889                                                       start + optionlength))) {\r
1890                 if (optionend - start > optionlength) {\r
1891                     m_optionarg_ = start + optionlength;\r
1892                     // start of the options, skip space\r
1893                     while (m_optionarg_ < optionend && (UCharacter.isWhitespace(rules.charAt(m_optionarg_)) || UCharacterProperty.isRuleWhiteSpace(rules.charAt(m_optionarg_))))\r
1894                     {   // eat whitespace\r
1895                         m_optionarg_ ++;\r
1896                     }\r
1897                 }\r
1898                 break;\r
1899             }\r
1900             i ++;\r
1901         }\r
1902         if(i == RULES_OPTIONS_.length) {\r
1903             i = -1;\r
1904         }\r
1905         return i;\r
1906     }\r
    /**
     * Reads and set collation options.
     * Dispatches on the option's index in RULES_OPTIONS_: indices 0-6 are
     * simple attribute options applied via setOptions(); 7 is variable top;
     * 8 is rearrange (accepted, no effect here); 9 is "before"; 10 is
     * "top"; 11-12 are "first"/"last" indirect boundaries; 13-14 are
     * copy/remove, which only consume their bracketed set here (they are
     * fully handled before normalization, in extractSetsFromRules()).
     * For options 13/14 this method advances m_current_ and m_optionEnd_
     * past the set.
     * @return TOKEN_SUCCESS_MASK_ if option is set correct (possibly OR'ed
     *         with TOKEN_VARIABLE_TOP_MASK_ or TOKEN_TOP_MASK_, or with an
     *         encoded "before" strength), 0 otherwise
     * @exception ParseException thrown when options in rules are wrong
     */
    private byte readAndSetOption() throws ParseException
    {
        int start = m_current_ + 1; // skip opening '['
        int i = readOption(m_rules_, start, m_optionEnd_);

        // readOption() leaves the argument position in m_optionarg_.
        int optionarg = m_optionarg_;

        if (i < 0) {
            // unknown option name
            throwParseException(m_rules_, start);
        }

        if (i < 7) {
            // simple attribute options: match the argument against the
            // option's sub-option names and set the corresponding value
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                                                                        j ++) {
                     String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                     int size = optionarg + subname.length();
                     if (m_rules_.length() > size
                         && subname.equalsIgnoreCase(m_rules_.substring(
                                                           optionarg, size))) {
                         setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
                             RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                         return TOKEN_SUCCESS_MASK_;
                     }
                }
            }
            // no argument, or no sub-option matched
            throwParseException(m_rules_, optionarg);
        }
        else if (i == 7) { // variable top
            return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
        }
        else if (i == 8) { // rearange
            return TOKEN_SUCCESS_MASK_;
        }
        else if (i == 9) { // before
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                                                                        j ++) {
                     String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                     int size = optionarg + subname.length();
                     if (m_rules_.length() > size
                         && subname.equalsIgnoreCase(
                                               m_rules_.substring(optionarg,
                                              optionarg + subname.length()))) {
                         // note precedence: '+' binds tighter than '|', so
                         // this is MASK | (value + 1) — the matched strength
                         // is encoded offset by one in the low bits
                         return (byte)(TOKEN_SUCCESS_MASK_
                            | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
                            + 1);
                     }
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if (i == 10) {  // top, we are going to have an array with
            // structures of limit CEs index to this array will be
            // src->parsedToken.indirectIndex
            m_parsedToken_.m_indirectIndex_ = 0;
            return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        }
        else if (i < 13) { // first, last
            for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {
                String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                int size = optionarg + subname.length();
                if (m_rules_.length() > size
                    && subname.equalsIgnoreCase(m_rules_.substring(optionarg,
                                                                   size))) {
                    // indirect index encodes option (i-10) and sub-option j
                    m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
                    return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if(i == 13 || i == 14) { // copy and remove are handled before normalization
            // we need to move end here: skip over the option's (possibly
            // nested) bracketed UnicodeSet so parsing resumes after it
            int noOpenBraces = 1;
            m_current_++; // skip opening brace
            while(m_current_ < m_source_.length() && noOpenBraces != 0) {
                if(m_source_.charAt(m_current_) == '[') {
                  noOpenBraces++;
                } else if(m_source_.charAt(m_current_) == ']') { // closing brace
                  noOpenBraces--;
                }
                m_current_++;
            }
            m_optionEnd_ = m_current_-1;
            return TOKEN_SUCCESS_MASK_;
        }
        else {
            throwParseException(m_rules_, optionarg);
        }
        return TOKEN_SUCCESS_MASK_; // we will never reach here.
    }
2003 \r
2004     /**\r
2005      * Set collation option\r
2006      * @param optionset option set to set\r
2007      * @param attribute type to set\r
2008      * @param value attribute value\r
2009      */\r
2010     private void setOptions(OptionSet optionset, int attribute, int value)\r
2011     {\r
2012         switch (attribute) {\r
2013             case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :\r
2014                 optionset.m_isHiragana4_\r
2015                             = (value == RuleBasedCollator.AttributeValue.ON_);\r
2016                 break;\r
2017             case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :\r
2018                 optionset.m_isFrenchCollation_\r
2019                              = (value == RuleBasedCollator.AttributeValue.ON_);\r
2020                 break;\r
2021             case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :\r
2022                 optionset.m_isAlternateHandlingShifted_\r
2023                              = (value\r
2024                                 == RuleBasedCollator.AttributeValue.SHIFTED_);\r
2025                 break;\r
2026             case RuleBasedCollator.Attribute.CASE_FIRST_ :\r
2027                 optionset.m_caseFirst_ = value;\r
2028                 break;\r
2029             case RuleBasedCollator.Attribute.CASE_LEVEL_ :\r
2030                 optionset.m_isCaseLevel_\r
2031                              = (value == RuleBasedCollator.AttributeValue.ON_);\r
2032                 break;\r
2033             case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :\r
2034                 if (value == RuleBasedCollator.AttributeValue.ON_) {\r
2035                     value = Collator.CANONICAL_DECOMPOSITION;\r
2036                 }\r
2037                 optionset.m_decomposition_ = value;\r
2038                 break;\r
2039             case RuleBasedCollator.Attribute.STRENGTH_ :\r
2040                 optionset.m_strength_ = value;\r
2041                 break;\r
2042             default :\r
2043                 break;\r
2044         }\r
2045       }\r
2046 \r
2047     UnicodeSet getTailoredSet() throws ParseException\r
2048     {\r
2049         boolean startOfRules = true;\r
2050         UnicodeSet tailored = new UnicodeSet();\r
2051         String pattern;\r
2052         CanonicalIterator it = new CanonicalIterator("");\r
2053 \r
2054         m_parsedToken_.m_strength_ = TOKEN_UNSET_;\r
2055         int sourcelimit = m_source_.length();\r
2056         //int expandNext = 0;\r
2057 \r
2058         while (m_current_ < sourcelimit) {\r
2059         m_parsedToken_.m_prefixOffset_ = 0;\r
2060         if (parseNextToken(startOfRules) < 0) {\r
2061             // we have reached the end\r
2062             continue;\r
2063         }\r
2064         startOfRules = false;\r
2065         // The idea is to tokenize the rule set. For each non-reset token,\r
2066         // we add all the canonicaly equivalent FCD sequences\r
2067             if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {\r
2068                 it.setSource(m_source_.substring(\r
2069                       m_parsedToken_.m_charsOffset_,\r
2070                       m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));\r
2071                 pattern = it.next();\r
2072                 while(pattern != null) {\r
2073                       if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) {\r
2074                         tailored.add(pattern);\r
2075                     }\r
2076                     pattern = it.next();\r
2077                 }\r
2078             }\r
2079         }\r
2080         return tailored;\r
2081     }\r
2082 \r
2083     final private void extractSetsFromRules(String rules) throws ParseException {\r
2084       int optionNumber = -1;\r
2085       int setStart = 0;\r
2086       int i = 0;\r
2087       while(i < rules.length()) {\r
2088         if(rules.charAt(i) == 0x005B) {\r
2089           optionNumber = readOption(rules, i+1, rules.length());\r
2090           setStart = m_optionarg_;\r
2091           if(optionNumber == 13) { /* copy - parts of UCA to tailoring */\r
2092             UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);\r
2093               if(m_copySet_ == null) {\r
2094                 m_copySet_ = newSet;\r
2095               } else {\r
2096                 m_copySet_.addAll(newSet);\r
2097               }\r
2098           } else if(optionNumber == 14) {\r
2099             UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);\r
2100               if(m_removeSet_ == null) {\r
2101                 m_removeSet_ = newSet;\r
2102               } else {\r
2103                 m_removeSet_.addAll(newSet);\r
2104               }\r
2105           }\r
2106         }\r
2107         i++;\r
2108       }\r
2109     }\r
2110 }\r