2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.text.ParseException;
\r
10 import java.util.Arrays;
\r
11 import java.util.Hashtable;
\r
13 import com.ibm.icu.impl.UCharacterProperty;
\r
14 import com.ibm.icu.lang.UCharacter;
\r
17 * Class for parsing collation rules, produces a list of tokens that will be
\r
18 * turned into collation elements
\r
19 * @author Syn Wee Quek
\r
20 * @since release 2.2, June 7 2002
\r
22 final class CollationRuleParser
\r
24 // public data members ---------------------------------------------------
\r
26 // package private constructors ------------------------------------------
\r
29 * <p>RuleBasedCollator constructor that takes the rules.
\r
30 * Please see RuleBasedCollator class description for more details on the
\r
31 * collation rule syntax.</p>
\r
32 * @see java.util.Locale
\r
33 * @param rules the collation rules to build the collation table from.
\r
34 * @exception ParseException thrown when argument rules have an invalid
\r
37 CollationRuleParser(String rules) throws ParseException
\r
39 extractSetsFromRules(rules);
\r
40 m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
\r
41 m_rules_ = m_source_.toString();
\r
43 m_extraCurrent_ = m_source_.length();
\r
44 m_variableTop_ = null;
\r
45 m_parsedToken_ = new ParsedToken();
\r
46 m_hashTable_ = new Hashtable<Token, Token>();
\r
47 m_options_ = new OptionSet(RuleBasedCollator.UCA_);
\r
48 m_listHeader_ = new TokenListHeader[512];
\r
49 m_resultLength_ = 0;
\r
50 m_prevStrength_ = TOKEN_UNSET_;
\r
51 // call assembleTokenList() manually, so that we can
\r
52 // init a parser and manually parse tokens
\r
53 //assembleTokenList();
\r
56 // package private inner classes -----------------------------------------
\r
59 * Collation options set
\r
61 static class OptionSet
\r
63 // package private constructor ---------------------------------------
\r
66 * Initializes the option set with the argument collators
\r
67 * @param collator option to use
\r
69 OptionSet(RuleBasedCollator collator)
\r
71 m_variableTopValue_ = collator.m_variableTopValue_;
\r
72 m_isFrenchCollation_ = collator.isFrenchCollation();
\r
73 m_isAlternateHandlingShifted_
\r
74 = collator.isAlternateHandlingShifted();
\r
75 m_caseFirst_ = collator.m_caseFirst_;
\r
76 m_isCaseLevel_ = collator.isCaseLevel();
\r
77 m_decomposition_ = collator.getDecomposition();
\r
78 m_strength_ = collator.getStrength();
\r
79 m_isHiragana4_ = collator.m_isHiragana4_;
\r
82 // package private data members --------------------------------------
\r
84 int m_variableTopValue_;
\r
85 boolean m_isFrenchCollation_;
\r
87 * Attribute for handling variable elements
\r
89 boolean m_isAlternateHandlingShifted_;
\r
91 * who goes first, lower case or uppercase
\r
95 * do we have an extra case level
\r
97 boolean m_isCaseLevel_;
\r
99 * attribute for normalization
\r
101 int m_decomposition_;
\r
103 * attribute for strength
\r
107 * attribute for special Hiragana
\r
109 boolean m_isHiragana4_;
\r
113 * List of tokens used by the collation rules
\r
115 static class TokenListHeader
\r
120 boolean m_indirect_;
\r
126 int m_previousContCE_;
\r
127 int m_pos_[] = new int[Collator.IDENTICAL + 1];
\r
128 int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
\r
129 int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
\r
130 int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
\r
131 Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
\r
132 Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
\r
136 * Token wrapper for collation rules
\r
140 // package private data members ---------------------------------------
\r
145 int m_expCELength_;
\r
151 int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
\r
152 TokenListHeader m_listHeader_;
\r
155 StringBuilder m_rules_;
\r
158 // package private constructors ---------------------------------------
\r
162 m_CE_ = new int[128];
\r
163 m_expCE_ = new int[128];
\r
164 // TODO: this should also handle reverse
\r
165 m_polarity_ = TOKEN_POLARITY_POSITIVE_;
\r
167 m_previous_ = null;
\r
169 m_expCELength_ = 0;
\r
172 // package private methods --------------------------------------------
\r
175 * Hashcode calculation for token
\r
176 * @return the hashcode
\r
178 public int hashCode()
\r
181 int len = (m_source_ & 0xFF000000) >>> 24;
\r
182 int inc = ((len - 32) / 32) + 1;
\r
184 int start = m_source_ & 0x00FFFFFF;
\r
185 int limit = start + len;
\r
187 while (start < limit) {
\r
188 result = (result * 37) + m_rules_.charAt(start);
\r
195 * Equals calculation
\r
196 * @param target object to compare
\r
197 * @return true if target is the same as this object
\r
199 public boolean equals(Object target)
\r
201 if (target == this) {
\r
204 if (target instanceof Token) {
\r
205 Token t = (Token)target;
\r
206 int sstart = m_source_ & 0x00FFFFFF;
\r
207 int tstart = t.m_source_ & 0x00FFFFFF;
\r
208 int slimit = (m_source_ & 0xFF000000) >> 24;
\r
209 int tlimit = (m_source_ & 0xFF000000) >> 24;
\r
211 int end = sstart + slimit - 1;
\r
213 if (m_source_ == 0 || t.m_source_ == 0) {
\r
216 if (slimit != tlimit) {
\r
219 if (m_source_ == t.m_source_) {
\r
223 while (sstart < end
\r
224 && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
\r
229 if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
\r
237 // package private data member -------------------------------------------
\r
240 * Indicator that the token has been reset, ie &amp; in the rules
\r
242 static final int TOKEN_RESET_ = 0xDEADBEEF;
\r
245 * Size of the number of tokens
\r
247 int m_resultLength_;
\r
249 * List of parsed tokens
\r
251 TokenListHeader m_listHeader_[];
\r
253 * Variable top token
\r
255 Token m_variableTop_;
\r
257 * Collation options
\r
259 OptionSet m_options_;
\r
261 * Normalized collation rules with some extra characters
\r
263 StringBuilder m_source_;
\r
265 * Hash table to keep all tokens
\r
267 Hashtable<Token, Token> m_hashTable_;
\r
269 // package private method ------------------------------------------------
\r
271 void setDefaultOptionsInCollator(RuleBasedCollator collator)
\r
273 collator.m_defaultStrength_ = m_options_.m_strength_;
\r
274 collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
\r
275 collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
\r
276 collator.m_defaultIsAlternateHandlingShifted_
\r
277 = m_options_.m_isAlternateHandlingShifted_;
\r
278 collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
\r
279 collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
\r
280 collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
\r
281 collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
\r
284 // private inner classes -------------------------------------------------
\r
287 * This is a token that has been parsed but not yet processed. Used to
\r
288 * reduce the number of arguments in the parser
\r
290 private static class ParsedToken
\r
292 // private constructor ----------------------------------------------
\r
295 * Empty constructor
\r
300 m_charsOffset_ = 0;
\r
301 m_extensionLen_ = 0;
\r
302 m_extensionOffset_ = 0;
\r
304 m_prefixOffset_ = 0;
\r
306 m_strength_ = TOKEN_UNSET_;
\r
309 // private data members ---------------------------------------------
\r
312 int m_charsOffset_;
\r
314 int m_extensionOffset_;
\r
315 int m_extensionLen_;
\r
316 int m_prefixOffset_;
\r
319 char m_indirectIndex_;
\r
323 * Boundary wrappers
\r
325 private static class IndirectBoundaries
\r
327 // package private constructor ---------------------------------------
\r
329 IndirectBoundaries(int startce[], int limitce[])
\r
331 // Set values for the top - TODO: once we have values for all the
\r
332 // indirects, we are going to initialize here.
\r
333 m_startCE_ = startce[0];
\r
334 m_startContCE_ = startce[1];
\r
335 if (limitce != null) {
\r
336 m_limitCE_ = limitce[0];
\r
337 m_limitContCE_ = limitce[1];
\r
341 m_limitContCE_ = 0;
\r
345 // package private data members --------------------------------------
\r
348 int m_startContCE_;
\r
350 int m_limitContCE_;
\r
354 * Collation option rule tag
\r
356 private static class TokenOption
\r
358 // package private constructor ---------------------------------------
\r
360 TokenOption(String name, int attribute, String suboptions[],
\r
361 int suboptionattributevalue[])
\r
364 m_attribute_ = attribute;
\r
365 m_subOptions_ = suboptions;
\r
366 m_subOptionAttributeValues_ = suboptionattributevalue;
\r
369 // package private data member ---------------------------------------
\r
371 private String m_name_;
\r
372 private int m_attribute_;
\r
373 private String m_subOptions_[];
\r
374 private int m_subOptionAttributeValues_[];
\r
377 // private variables -----------------------------------------------------
\r
380 * Current parsed token
\r
382 private ParsedToken m_parsedToken_;
\r
386 private String m_rules_;
\r
387 private int m_current_;
\r
389 * End of the option while reading.
\r
390 * Need it for UnicodeSet reading support.
\r
392 private int m_optionEnd_;
\r
394 * Current offset in m_source
\r
396 //private int m_sourceLimit_;
\r
398 * Offset into m_source_ for the extra expansion characters
\r
400 private int m_extraCurrent_;
\r
403 * UnicodeSet that contains code points to be copied from the UCA
\r
405 UnicodeSet m_copySet_;
\r
408 * UnicodeSet that contains code points for which we want to remove
\r
409 * UCA contractions. It implies copying of these code points from
\r
412 UnicodeSet m_removeSet_;
\r
414 * Stores the previous token's strength when making a list of same level
\r
417 private int m_prevStrength_;
\r
420 * This is space for the extra strings that need to be unquoted during the
\r
421 * parsing of the rules
\r
423 //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
\r
425 * Indicator that the token is not set yet
\r
427 private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
\r
429 * Indicator that the rule is in the > polarity, ie everything on the
\r
430 * right of the rule is less than
\r
432 //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
\r
434 * Indicator that the rule is in the < polarity, ie everything on the
\r
435 * right of the rule is greater than
\r
437 private static final int TOKEN_POLARITY_POSITIVE_ = 1;
\r
439 * Flag mask to determine if top is set
\r
441 private static final int TOKEN_TOP_MASK_ = 0x04;
\r
443 * Flag mask to determine if variable top is set
\r
445 private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
\r
447 * Flag mask to determine if a before attribute is set
\r
449 private static final int TOKEN_BEFORE_ = 0x03;
\r
451 * For use in parsing token options
\r
453 private static final int TOKEN_SUCCESS_MASK_ = 0x10;
\r
456 * These values are used for finding CE values for indirect positioning.
\r
457 * Indirect positioning is a mechanism for allowing resets on symbolic
\r
458 * values. It only works for resets and you cannot tailor indirect names.
\r
459 * An indirect name can define either an anchor point or a range. An anchor
\r
460 * point behaves in exactly the same way as a code point in reset would,
\r
461 * except that it cannot be tailored. A range (we currently only know for
\r
462 * the [top] range) will explicitly set the upper bound for generated CEs,
\r
463 * thus allowing for better control over how many CEs can be squeezed
\r
464 * between in the range without performance penalty. In that respect, we use
\r
465 * [top] for tailoring of locales that use CJK characters. Other indirect
\r
466 * values are currently a pure convenience, they can be used to assure that
\r
467 * the CEs will be always positioned in the same place relative to a point
\r
468 * with known properties (e.g. first primary ignorable).
\r
470 private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];
\r
473 // * Inverse UCA constants
\r
475 // private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
\r
476 // private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
\r
477 // private static final int INVERSE_SHIFT_VALUE_ = 20;
\r
480 * Collation option tags
\r
481 * [last variable] last variable value
\r
482 * [last primary ignorable] largest CE for primary ignorable
\r
483 * [last secondary ignorable] largest CE for secondary ignorable
\r
484 * [last tertiary ignorable] largest CE for tertiary ignorable
\r
485 * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
\r
487 private static final TokenOption RULES_OPTIONS_[];
\r
491 INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
\r
492 // UCOL_RESET_TOP_VALUE
\r
493 INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
\r
494 RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
\r
495 RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
\r
496 // UCOL_FIRST_PRIMARY_IGNORABLE
\r
497 INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
\r
498 RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
\r
500 // UCOL_LAST_PRIMARY_IGNORABLE
\r
501 INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
\r
502 RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
\r
505 // UCOL_FIRST_SECONDARY_IGNORABLE
\r
506 INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
\r
507 RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
\r
509 // UCOL_LAST_SECONDARY_IGNORABLE
\r
510 INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
\r
511 RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
\r
513 // UCOL_FIRST_TERTIARY_IGNORABLE
\r
514 INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
\r
515 RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
\r
517 // UCOL_LAST_TERTIARY_IGNORABLE
\r
518 INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
\r
519 RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
\r
521 // UCOL_FIRST_VARIABLE;
\r
522 INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
\r
523 RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
\r
525 // UCOL_LAST_VARIABLE
\r
526 INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
\r
527 RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
\r
529 // UCOL_FIRST_NON_VARIABLE
\r
530 INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
\r
531 RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
\r
533 // UCOL_LAST_NON_VARIABLE
\r
534 INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
\r
535 RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
\r
536 RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
\r
537 // UCOL_FIRST_IMPLICIT
\r
538 INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
\r
539 RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
\r
541 // UCOL_LAST_IMPLICIT
\r
542 INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
\r
543 RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
\r
544 RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
\r
545 // UCOL_FIRST_TRAILING
\r
546 INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
\r
547 RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
\r
549 // UCOL_LAST_TRAILING
\r
550 INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
\r
551 RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
\r
553 INDIRECT_BOUNDARIES_[14].m_limitCE_
\r
554 = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;
\r
556 RULES_OPTIONS_ = new TokenOption[19];
\r
557 String option[] = {"non-ignorable", "shifted"};
\r
558 int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
\r
559 RuleBasedCollator.AttributeValue.SHIFTED_};
\r
560 RULES_OPTIONS_[0] = new TokenOption("alternate",
\r
561 RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
\r
563 option = new String[1];
\r
565 value = new int[1];
\r
566 value[0] = RuleBasedCollator.AttributeValue.ON_;
\r
567 RULES_OPTIONS_[1] = new TokenOption("backwards",
\r
568 RuleBasedCollator.Attribute.FRENCH_COLLATION_,
\r
570 String offonoption[] = new String[2];
\r
571 offonoption[0] = "off";
\r
572 offonoption[1] = "on";
\r
573 int offonvalue[] = new int[2];
\r
574 offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
\r
575 offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
\r
576 RULES_OPTIONS_[2] = new TokenOption("caseLevel",
\r
577 RuleBasedCollator.Attribute.CASE_LEVEL_,
\r
578 offonoption, offonvalue);
\r
579 option = new String[3];
\r
580 option[0] = "lower";
\r
581 option[1] = "upper";
\r
583 value = new int[3];
\r
584 value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
\r
585 value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
\r
586 value[2] = RuleBasedCollator.AttributeValue.OFF_;
\r
587 RULES_OPTIONS_[3] = new TokenOption("caseFirst",
\r
588 RuleBasedCollator.Attribute.CASE_FIRST_,
\r
590 RULES_OPTIONS_[4] = new TokenOption("normalization",
\r
591 RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
\r
592 offonoption, offonvalue);
\r
593 RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
\r
594 RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
\r
595 offonoption, offonvalue);
\r
596 option = new String[5];
\r
602 value = new int[5];
\r
603 value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
\r
604 value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
\r
605 value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
\r
606 value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
\r
607 value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
\r
608 RULES_OPTIONS_[6] = new TokenOption("strength",
\r
609 RuleBasedCollator.Attribute.STRENGTH_,
\r
611 RULES_OPTIONS_[7] = new TokenOption("variable top",
\r
612 RuleBasedCollator.Attribute.LIMIT_,
\r
614 RULES_OPTIONS_[8] = new TokenOption("rearrange",
\r
615 RuleBasedCollator.Attribute.LIMIT_,
\r
617 option = new String[3];
\r
621 value = new int[3];
\r
622 value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
\r
623 value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
\r
624 value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
\r
625 RULES_OPTIONS_[9] = new TokenOption("before",
\r
626 RuleBasedCollator.Attribute.LIMIT_,
\r
628 RULES_OPTIONS_[10] = new TokenOption("top",
\r
629 RuleBasedCollator.Attribute.LIMIT_,
\r
631 String firstlastoption[] = new String[7];
\r
632 firstlastoption[0] = "primary";
\r
633 firstlastoption[1] = "secondary";
\r
634 firstlastoption[2] = "tertiary";
\r
635 firstlastoption[3] = "variable";
\r
636 firstlastoption[4] = "regular";
\r
637 firstlastoption[5] = "implicit";
\r
638 firstlastoption[6] = "trailing";
\r
640 int firstlastvalue[] = new int[7];
\r
641 Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);
\r
643 RULES_OPTIONS_[11] = new TokenOption("first",
\r
644 RuleBasedCollator.Attribute.LIMIT_,
\r
645 firstlastoption, firstlastvalue);
\r
646 RULES_OPTIONS_[12] = new TokenOption("last",
\r
647 RuleBasedCollator.Attribute.LIMIT_,
\r
648 firstlastoption, firstlastvalue);
\r
649 RULES_OPTIONS_[13] = new TokenOption("optimize",
\r
650 RuleBasedCollator.Attribute.LIMIT_,
\r
652 RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
\r
653 RuleBasedCollator.Attribute.LIMIT_,
\r
655 RULES_OPTIONS_[15] = new TokenOption("undefined",
\r
656 RuleBasedCollator.Attribute.LIMIT_,
\r
658 RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
\r
659 RuleBasedCollator.Attribute.LIMIT_,
\r
661 RULES_OPTIONS_[17] = new TokenOption("charsetname",
\r
662 RuleBasedCollator.Attribute.LIMIT_,
\r
664 RULES_OPTIONS_[18] = new TokenOption("charset",
\r
665 RuleBasedCollator.Attribute.LIMIT_,
\r
670 * Utility data members
\r
672 private Token m_utilToken_ = new Token();
\r
673 private CollationElementIterator m_UCAColEIter_
\r
674 = RuleBasedCollator.UCA_.getCollationElementIterator("");
\r
675 private int m_utilCEBuffer_[] = new int[2];
\r
677 // private methods -------------------------------------------------------
\r
680 * Assembles the token list
\r
681 * @exception ParseException thrown when rules syntax fails
\r
683 int assembleTokenList() throws ParseException
\r
685 Token lastToken = null;
\r
686 m_parsedToken_.m_strength_ = TOKEN_UNSET_;
\r
687 int sourcelimit = m_source_.length();
\r
688 int expandNext = 0;
\r
690 while (m_current_ < sourcelimit) {
\r
691 m_parsedToken_.m_prefixOffset_ = 0;
\r
692 if (parseNextToken(lastToken == null) < 0) {
\r
693 // we have reached the end
\r
696 char specs = m_parsedToken_.m_flags_;
\r
697 boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
\r
698 boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
\r
699 int lastStrength = TOKEN_UNSET_;
\r
700 if (lastToken != null) {
\r
701 lastStrength = lastToken.m_strength_;
\r
703 m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
\r
704 | m_parsedToken_.m_charsOffset_;
\r
705 m_utilToken_.m_rules_ = m_source_;
\r
706 // 4 Lookup each source in the CharsToToken map, and find a
\r
708 Token sourceToken = m_hashTable_.get(m_utilToken_);
\r
709 if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
\r
710 if (lastToken == null) {
\r
711 // this means that rules haven't started properly
\r
712 throwParseException(m_source_.toString(), 0);
\r
714 // 6 Otherwise (when relation != reset)
\r
715 if (sourceToken == null) {
\r
716 // If sourceToken is null, create new one
\r
717 sourceToken = new Token();
\r
718 sourceToken.m_rules_ = m_source_;
\r
719 sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
\r
720 | m_parsedToken_.m_charsOffset_;
\r
721 sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
\r
722 | m_parsedToken_.m_prefixOffset_;
\r
723 // TODO: this should also handle reverse
\r
724 sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
\r
725 sourceToken.m_next_ = null;
\r
726 sourceToken.m_previous_ = null;
\r
727 sourceToken.m_CELength_ = 0;
\r
728 sourceToken.m_expCELength_ = 0;
\r
729 m_hashTable_.put(sourceToken, sourceToken);
\r
732 // we could have fished out a reset here
\r
733 if (sourceToken.m_strength_ != TOKEN_RESET_
\r
734 && lastToken != sourceToken) {
\r
735 // otherwise remove sourceToken from where it was.
\r
736 if (sourceToken.m_next_ != null) {
\r
737 if (sourceToken.m_next_.m_strength_
\r
738 > sourceToken.m_strength_) {
\r
739 sourceToken.m_next_.m_strength_
\r
740 = sourceToken.m_strength_;
\r
742 sourceToken.m_next_.m_previous_
\r
743 = sourceToken.m_previous_;
\r
746 sourceToken.m_listHeader_.m_last_
\r
747 = sourceToken.m_previous_;
\r
749 if (sourceToken.m_previous_ != null) {
\r
750 sourceToken.m_previous_.m_next_
\r
751 = sourceToken.m_next_;
\r
754 sourceToken.m_listHeader_.m_first_
\r
755 = sourceToken.m_next_;
\r
757 sourceToken.m_next_ = null;
\r
758 sourceToken.m_previous_ = null;
\r
761 sourceToken.m_strength_ = m_parsedToken_.m_strength_;
\r
762 sourceToken.m_listHeader_ = lastToken.m_listHeader_;
\r
764 // 1. Find the strongest strength in each list, and set
\r
765 // strongestP and strongestN accordingly in the headers.
\r
766 if (lastStrength == TOKEN_RESET_
\r
767 || sourceToken.m_listHeader_.m_first_ == null) {
\r
768 // If LAST is a reset insert sourceToken in the list.
\r
769 if (sourceToken.m_listHeader_.m_first_ == null) {
\r
770 sourceToken.m_listHeader_.m_first_ = sourceToken;
\r
771 sourceToken.m_listHeader_.m_last_ = sourceToken;
\r
773 else { // we need to find a place for us
\r
774 // and we'll get in front of the same strength
\r
775 if (sourceToken.m_listHeader_.m_first_.m_strength_
\r
776 <= sourceToken.m_strength_) {
\r
777 sourceToken.m_next_
\r
778 = sourceToken.m_listHeader_.m_first_;
\r
779 sourceToken.m_next_.m_previous_ = sourceToken;
\r
780 sourceToken.m_listHeader_.m_first_ = sourceToken;
\r
781 sourceToken.m_previous_ = null;
\r
784 lastToken = sourceToken.m_listHeader_.m_first_;
\r
785 while (lastToken.m_next_ != null
\r
786 && lastToken.m_next_.m_strength_
\r
787 > sourceToken.m_strength_) {
\r
788 lastToken = lastToken.m_next_;
\r
790 if (lastToken.m_next_ != null) {
\r
791 lastToken.m_next_.m_previous_ = sourceToken;
\r
794 sourceToken.m_listHeader_.m_last_
\r
797 sourceToken.m_previous_ = lastToken;
\r
798 sourceToken.m_next_ = lastToken.m_next_;
\r
799 lastToken.m_next_ = sourceToken;
\r
804 // Otherwise (when LAST is not a reset)
\r
805 // if polarity (LAST) == polarity(relation), insert
\r
806 // sourceToken after LAST, otherwise insert before.
\r
807 // when inserting after or before, search to the next
\r
808 // position with the same strength in that direction.
\r
809 // (This is called postpone insertion).
\r
810 if (sourceToken != lastToken) {
\r
811 if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
\r
812 while (lastToken.m_next_ != null
\r
813 && lastToken.m_next_.m_strength_
\r
814 > sourceToken.m_strength_) {
\r
815 lastToken = lastToken.m_next_;
\r
817 sourceToken.m_previous_ = lastToken;
\r
818 if (lastToken.m_next_ != null) {
\r
819 lastToken.m_next_.m_previous_ = sourceToken;
\r
822 sourceToken.m_listHeader_.m_last_ = sourceToken;
\r
824 sourceToken.m_next_ = lastToken.m_next_;
\r
825 lastToken.m_next_ = sourceToken;
\r
828 while (lastToken.m_previous_ != null
\r
829 && lastToken.m_previous_.m_strength_
\r
830 > sourceToken.m_strength_) {
\r
831 lastToken = lastToken.m_previous_;
\r
833 sourceToken.m_next_ = lastToken;
\r
834 if (lastToken.m_previous_ != null) {
\r
835 lastToken.m_previous_.m_next_ = sourceToken;
\r
838 sourceToken.m_listHeader_.m_first_
\r
841 sourceToken.m_previous_ = lastToken.m_previous_;
\r
842 lastToken.m_previous_ = sourceToken;
\r
845 else { // repeated one thing twice in rules, stay with the
\r
846 // stronger strength
\r
847 if (lastStrength < sourceToken.m_strength_) {
\r
848 sourceToken.m_strength_ = lastStrength;
\r
852 // if the token was a variable top, we're gonna put it in
\r
853 if (variableTop == true && m_variableTop_ == null) {
\r
854 variableTop = false;
\r
855 m_variableTop_ = sourceToken;
\r
857 // Treat the expansions.
\r
858 // There are two types of expansions: explicit (x / y) and
\r
859 // reset based propagating expansions
\r
860 // (&abc * d * e <=> &ab * d / c * e / c)
\r
861 // if both of them are in effect for a token, they are combined.
\r
862 sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
\r
863 | m_parsedToken_.m_extensionOffset_;
\r
864 if (expandNext != 0) {
\r
865 if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
\r
866 // primary strength kills off the implicit expansion
\r
869 else if (sourceToken.m_expansion_ == 0) {
\r
870 // if there is no expansion, implicit is just added to
\r
872 sourceToken.m_expansion_ = expandNext;
\r
875 // there is both explicit and implicit expansion.
\r
876 // We need to make a combination
\r
877 int start = expandNext & 0xFFFFFF;
\r
878 int size = expandNext >>> 24;
\r
880 m_source_.append(m_source_.substring(start,
\r
883 start = m_parsedToken_.m_extensionOffset_;
\r
884 m_source_.append(m_source_.substring(start,
\r
885 start + m_parsedToken_.m_extensionLen_));
\r
886 sourceToken.m_expansion_ = (size
\r
887 + m_parsedToken_.m_extensionLen_) << 24
\r
889 m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
\r
892 // if the previous token was a reset before, the strength of this
\r
893 // token must match the strength of before. Otherwise we have an
\r
894 // undefined situation.
\r
895 // In other words, we currently have a kludge which we use to
\r
896 // represent &a >> x. This is written as &[before 2]a << x.
\r
897 if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
\r
898 int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
\r
899 if(beforeStrength != sourceToken.m_strength_) {
\r
900 throwParseException(m_source_.toString(), m_current_);
\r
906 if (lastToken != null && lastStrength == TOKEN_RESET_) {
\r
907 // if the previous token was also a reset, this means that
\r
908 // we have two consecutive resets and we want to remove the
\r
909 // previous one if empty
\r
910 if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
\r
911 m_resultLength_ --;
\r
914 if (sourceToken == null) {
\r
915 // this is a reset, but it might still be somewhere in the
\r
916 // tailoring, in shorter form
\r
917 int searchCharsLen = m_parsedToken_.m_charsLen_;
\r
918 while (searchCharsLen > 1 && sourceToken == null) {
\r
920 // key = searchCharsLen << 24 | charsOffset;
\r
921 m_utilToken_.m_source_ = searchCharsLen << 24
\r
922 | m_parsedToken_.m_charsOffset_;
\r
923 m_utilToken_.m_rules_ = m_source_;
\r
924 sourceToken = m_hashTable_.get(m_utilToken_);
\r
926 if (sourceToken != null) {
\r
927 expandNext = (m_parsedToken_.m_charsLen_
\r
928 - searchCharsLen) << 24
\r
929 | (m_parsedToken_.m_charsOffset_
\r
933 if ((specs & TOKEN_BEFORE_) != 0) {
\r
934 if (top == false) {
\r
935 // we're doing before & there is no indirection
\r
936 int strength = (specs & TOKEN_BEFORE_) - 1;
\r
937 if (sourceToken != null
\r
938 && sourceToken.m_strength_ != TOKEN_RESET_) {
\r
939 // this is a before that is already ordered in the UCA
\r
940 // - so we need to get the previous with good strength
\r
941 while (sourceToken.m_strength_ > strength
\r
942 && sourceToken.m_previous_ != null) {
\r
943 sourceToken = sourceToken.m_previous_;
\r
945 // here, either we hit the strength or NULL
\r
946 if (sourceToken.m_strength_ == strength) {
\r
947 if (sourceToken.m_previous_ != null) {
\r
948 sourceToken = sourceToken.m_previous_;
\r
950 else { // start of list
\r
952 = sourceToken.m_listHeader_.m_reset_;
\r
955 else { // we hit NULL, we should be doing the else part
\r
957 = sourceToken.m_listHeader_.m_reset_;
\r
958 sourceToken = getVirginBefore(sourceToken,
\r
964 = getVirginBefore(sourceToken, strength);
\r
968 // this is both before and indirection
\r
970 m_listHeader_[m_resultLength_] = new TokenListHeader();
\r
971 m_listHeader_[m_resultLength_].m_previousCE_ = 0;
\r
972 m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
\r
973 m_listHeader_[m_resultLength_].m_indirect_ = true;
\r
974 // we need to do slightly more work. we need to get the
\r
975 // baseCE using the inverse UCA & getPrevious. The next
\r
976 // bound is not set, and will be decided in ucol_bld
\r
977 int strength = (specs & TOKEN_BEFORE_) - 1;
\r
978 int baseCE = INDIRECT_BOUNDARIES_[
\r
979 m_parsedToken_.m_indirectIndex_].m_startCE_;
\r
980 int baseContCE = INDIRECT_BOUNDARIES_[
\r
981 m_parsedToken_.m_indirectIndex_].m_startContCE_;
\r
982 int ce[] = new int[2];
\r
983 if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
\r
984 && (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
\r
985 int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
\r
986 int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
\r
987 int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
\r
988 ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
\r
989 ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
991 CollationParsedRuleBuilder.InverseUCA invuca
\r
992 = CollationParsedRuleBuilder.INVERSE_UCA_;
\r
993 invuca.getInversePrevCE(baseCE, baseContCE, strength,
\r
996 m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
\r
997 m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
\r
998 m_listHeader_[m_resultLength_].m_nextCE_ = 0;
\r
999 m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
\r
1001 sourceToken = new Token();
\r
1002 expandNext = initAReset(0, sourceToken);
\r
1005 // 5 If the relation is a reset:
\r
1006 // If sourceToken is null
\r
1007 // Create new list, create new sourceToken, make the baseCE
\r
1008 // from source, put the sourceToken in ListHeader of the new
\r
1010 if (sourceToken == null) {
\r
1011 if (m_listHeader_[m_resultLength_] == null) {
\r
1012 m_listHeader_[m_resultLength_] = new TokenListHeader();
\r
1014 // 3 Consider each item: relation, source, and expansion:
\r
1015 // e.g. ...< x / y ...
\r
1016 // First convert all expansions into normal form.
\r
1018 // If "xy" doesn't occur earlier in the list or in the UCA,
\r
1019 // convert &xy * c * d * ... into &x * c/y * d * ...
\r
1020 // Note: reset values can never have expansions, although
\r
1021 // they can cause the very next item to have one. They may
\r
1022 // be contractions, if they are found earlier in the list.
\r
1023 if (top == false) {
\r
1024 CollationElementIterator coleiter
\r
1025 = RuleBasedCollator.UCA_.getCollationElementIterator(
\r
1026 m_source_.substring(m_parsedToken_.m_charsOffset_,
\r
1027 m_parsedToken_.m_charsOffset_
\r
1028 + m_parsedToken_.m_charsLen_));
\r
1030 int CE = coleiter.next();
\r
1031 // offset to the character in the full rule string
\r
1032 int expand = coleiter.getOffset()
\r
1033 + m_parsedToken_.m_charsOffset_;
\r
1034 int SecondCE = coleiter.next();
\r
1036 m_listHeader_[m_resultLength_].m_baseCE_
\r
1037 = CE & 0xFFFFFF3F;
\r
1038 if (RuleBasedCollator.isContinuation(SecondCE)) {
\r
1039 m_listHeader_[m_resultLength_].m_baseContCE_
\r
1043 m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
\r
1045 m_listHeader_[m_resultLength_].m_nextCE_ = 0;
\r
1046 m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
\r
1047 m_listHeader_[m_resultLength_].m_previousCE_ = 0;
\r
1048 m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
\r
1049 m_listHeader_[m_resultLength_].m_indirect_ = false;
\r
1050 sourceToken = new Token();
\r
1051 expandNext = initAReset(expand, sourceToken);
\r
1053 else { // top == TRUE
\r
1055 m_listHeader_[m_resultLength_].m_previousCE_ = 0;
\r
1056 m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
\r
1057 m_listHeader_[m_resultLength_].m_indirect_ = true;
\r
1058 IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
\r
1059 m_parsedToken_.m_indirectIndex_];
\r
1060 m_listHeader_[m_resultLength_].m_baseCE_
\r
1062 m_listHeader_[m_resultLength_].m_baseContCE_
\r
1063 = ib.m_startContCE_;
\r
1064 m_listHeader_[m_resultLength_].m_nextCE_
\r
1066 m_listHeader_[m_resultLength_].m_nextContCE_
\r
1067 = ib.m_limitContCE_;
\r
1068 sourceToken = new Token();
\r
1069 expandNext = initAReset(0, sourceToken);
\r
1072 else { // reset to something already in rules
\r
1076 // 7 After all this, set LAST to point to sourceToken, and goto
\r
1078 lastToken = sourceToken;
\r
1081 if (m_resultLength_ > 0
\r
1082 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
\r
1083 m_resultLength_ --;
\r
1085 return m_resultLength_;
\r
1089 * Formats and throws a ParseException
\r
1090 * @param rules collation rule that failed
\r
1091 * @param offset failed offset in rules
\r
1092 * @throws ParseException with failure information
\r
1094 private static final void throwParseException(String rules, int offset)
\r
1095 throws ParseException
\r
1097 // for pre-context
\r
1098 String precontext = rules.substring(0, offset);
\r
1099 String postcontext = rules.substring(offset, rules.length());
\r
1100 StringBuilder error = new StringBuilder(
\r
1101 "Parse error occurred in rule at offset ");
\r
1102 error.append(offset);
\r
1103 error.append("\n after the prefix \"");
\r
1104 error.append(precontext);
\r
1105 error.append("\" before the suffix \"");
\r
1106 error.append(postcontext);
\r
1107 throw new ParseException(error.toString(), offset);
\r
1110 private final boolean doSetTop() {
\r
1111 m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
\r
1112 m_source_.append((char)0xFFFE);
\r
1113 IndirectBoundaries ib =
\r
1114 INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
\r
1115 m_source_.append((char)(ib.m_startCE_ >> 16));
\r
1116 m_source_.append((char)(ib.m_startCE_ & 0xFFFF));
\r
1117 m_extraCurrent_ += 3;
\r
1118 if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_
\r
1119 ].m_startContCE_ == 0) {
\r
1120 m_parsedToken_.m_charsLen_ = 3;
\r
1123 m_source_.append((char)(INDIRECT_BOUNDARIES_[
\r
1124 m_parsedToken_.m_indirectIndex_
\r
1125 ].m_startContCE_ >> 16));
\r
1126 m_source_.append((char)(INDIRECT_BOUNDARIES_[
\r
1127 m_parsedToken_.m_indirectIndex_
\r
1128 ].m_startContCE_ & 0xFFFF));
\r
1129 m_extraCurrent_ += 2;
\r
1130 m_parsedToken_.m_charsLen_ = 5;
\r
1135 private static boolean isCharNewLine(char c) {
\r
1137 case 0x000A: /* LF */
\r
1138 case 0x000D: /* CR */
\r
1139 case 0x000C: /* FF */
\r
1140 case 0x0085: /* NEL */
\r
1141 case 0x2028: /* LS */
\r
1142 case 0x2029: /* PS */
\r
/**
 * Getting the next token
 *
 * @param startofrules
 *            flag indicating if we are at the start of rules
 * @return the offset of the rules
 * @exception ParseException
 *                thrown when rule parsing fails
 */
// NOTE(review): this excerpt has lines elided by extraction (several
// braces, break statements, "m_current_ ++" advances, the "switch (ch)"
// line and the declaration of 'before' are missing). Visible code tokens
// are kept verbatim; only comments were added.
@SuppressWarnings("fallthrough")
private int parseNextToken(boolean startofrules) throws ParseException
    // per-token parse state
    boolean variabletop = false;
    boolean top = false;
    boolean inchars = true;      // true: reading source chars; false: expansion
    boolean inquote = false;
    boolean wasinquote = false;
    boolean isescaped = false;
    int /*newcharslen = 0,*/ newextensionlen = 0;
    int /*charsoffset = 0,*/ extensionoffset = 0;
    int newstrength = TOKEN_UNSET_;

    // reset the shared parsed-token fields before scanning a new token
    m_parsedToken_.m_charsLen_ = 0;
    m_parsedToken_.m_charsOffset_ = 0;
    m_parsedToken_.m_prefixOffset_ = 0;
    m_parsedToken_.m_prefixLen_ = 0;
    m_parsedToken_.m_indirectIndex_ = 0;

    int limit = m_rules_.length();
    while (m_current_ < limit) {
        char ch = m_source_.charAt(m_current_);
        // quoted section handling: a quote char inside a quote
        if (ch == 0x0027) { // '\''
            // accumulate quoted characters into source chars or expansion
            if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
                if (m_parsedToken_.m_charsLen_ == 0) {
                    m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
                m_parsedToken_.m_charsLen_ ++;
                if (newextensionlen == 0) {
                    extensionoffset = m_extraCurrent_;
                newextensionlen ++;
        else if (isescaped) {
            // character following a backslash: taken literally
            isescaped = false;
            if (newstrength == TOKEN_UNSET_) {
                throwParseException(m_rules_, m_current_);
            if (ch != 0 && m_current_ != limit) {
                if (m_parsedToken_.m_charsLen_ == 0) {
                    m_parsedToken_.m_charsOffset_ = m_current_;
                m_parsedToken_.m_charsLen_ ++;
                if (newextensionlen == 0) {
                    extensionoffset = m_current_;
                newextensionlen ++;
            if (!UCharacterProperty.isRuleWhiteSpace(ch)) {
                // Sets the strength for this entry
                case 0x003D : // '='
                    if (newstrength != TOKEN_UNSET_) {
                        // a previous token is pending: finish it first
                        return doEndParseNextToken(newstrength,
                                            variabletop, before);
                    // if we start with strength, we'll reset to top
                    if (startofrules == true) {
                        m_parsedToken_.m_indirectIndex_ = 5;
                        return doEndParseNextToken(TOKEN_RESET_,
                                            variabletop, before);
                    newstrength = Collator.IDENTICAL;
                    // '=*' opens a starred (strength-repeating) sequence
                    if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
                        m_prevStrength_ = newstrength;
                        m_prevStrength_ = TOKEN_UNSET_;
                case 0x002C : // ','
                    if (newstrength != TOKEN_UNSET_) {
                        return doEndParseNextToken(newstrength,
                                            variabletop, before);
                    // if we start with strength, we'll reset to top
                    if (startofrules == true) {
                        m_parsedToken_.m_indirectIndex_ = 5;
                        return doEndParseNextToken(TOKEN_RESET_,
                                            variabletop, before);
                    newstrength = Collator.TERTIARY;
                    m_prevStrength_ = TOKEN_UNSET_;
                case 0x003B : // ';'
                    if (newstrength != TOKEN_UNSET_) {
                        return doEndParseNextToken(newstrength,
                                            variabletop, before);
                    // if we start with strength, we'll reset to top
                    if (startofrules == true) {
                        m_parsedToken_.m_indirectIndex_ = 5;
                        return doEndParseNextToken(TOKEN_RESET_,
                                            variabletop, before);
                    newstrength = Collator.SECONDARY;
                    m_prevStrength_ = TOKEN_UNSET_;
                case 0x003C : // '<'
                    if (newstrength != TOKEN_UNSET_) {
                        return doEndParseNextToken(newstrength,
                                            variabletop, before);
                    // if we start with strength, we'll reset to top
                    if (startofrules == true) {
                        m_parsedToken_.m_indirectIndex_ = 5;
                        return doEndParseNextToken(TOKEN_RESET_,
                                            variabletop, before);
                    // before this, do a scan to verify whether this is
                    // another strength
                    if (m_source_.charAt(m_current_ + 1) == 0x003C) {
                        if (m_source_.charAt(m_current_ + 1) == 0x003C) {
                            m_current_ ++; // three in a row!
                            newstrength = Collator.TERTIARY;
                        else { // two in a row
                            newstrength = Collator.SECONDARY;
                    else { // just one
                        newstrength = Collator.PRIMARY;
                    // '<*' opens a starred sequence at this strength
                    if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
                        m_prevStrength_ = newstrength;
                        m_prevStrength_ = TOKEN_UNSET_;
                case 0x0026 : // '&'
                    if (newstrength != TOKEN_UNSET_) {
                        return doEndParseNextToken(newstrength,
                                            variabletop, before);
                    newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
                    m_prevStrength_ = TOKEN_UNSET_;
                case 0x005b : // '['
                    // options - read an option, analyze it
                    m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
                    if (m_optionEnd_ != -1) { // ']'
                        byte result = readAndSetOption();
                        m_current_ = m_optionEnd_;
                        if ((result & TOKEN_TOP_MASK_) != 0) {
                            if (newstrength == TOKEN_RESET_) {
                                if (before != 0) {
                                    // This is a combination of before and
                                    // indirection like
                                    // '&[before 2][first regular]<b'
                                    m_source_.append((char)0x002d);
                                    m_source_.append((char)before);
                                    m_extraCurrent_ += 2;
                                    m_parsedToken_.m_charsLen_ += 2;
                                return doEndParseNextToken(newstrength,
                                            variabletop, before);
                                // [top]-style option without a pending reset
                                throwParseException(m_rules_, m_current_);
                        else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {
                            if (newstrength != TOKEN_RESET_
                                && newstrength != TOKEN_UNSET_) {
                                // tag the token with a synthetic 0xFFFF char
                                variabletop = true;
                                m_parsedToken_.m_charsOffset_
                                                        = m_extraCurrent_;
                                m_source_.append((char)0xFFFF);
                                m_extraCurrent_ ++;
                                m_parsedToken_.m_charsLen_ = 1;
                                return doEndParseNextToken(newstrength,
                                            variabletop, before);
                                throwParseException(m_rules_, m_current_);
                        else if ((result & TOKEN_BEFORE_) != 0){
                            if (newstrength == TOKEN_RESET_) {
                                before = (byte)(result & TOKEN_BEFORE_);
                                // [before n] is only legal on a reset
                                throwParseException(m_rules_, m_current_);
                case 0x002F : // '/'
                    wasinquote = false; // if we were copying source
                                        // characters, we want to stop now
                    inchars = false; // we're now processing expansion
                case 0x005C : // back slash for escaped chars
                // found a quote, we're gonna start copying
                case 0x0027 : //'\''
                    if (newstrength == TOKEN_UNSET_) {
                        if (m_prevStrength_ == TOKEN_UNSET_) {
                            // quote is illegal until we have a strength
                            throwParseException(m_rules_, m_current_);
                        newstrength = m_prevStrength_;
                    if (inchars) { // we're doing characters
                        if (wasinquote == false) {
                            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
                            // move chars already collected to the extra tail
                            if (m_parsedToken_.m_charsLen_ != 0) {
                                m_source_.append(m_source_.substring(
                                    m_current_ - m_parsedToken_.m_charsLen_,
                                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
                        m_parsedToken_.m_charsLen_ ++;
                    else { // we're doing an expansion
                        if (wasinquote == false) {
                            extensionoffset = m_extraCurrent_;
                            if (newextensionlen != 0) {
                                m_source_.append(m_source_.substring(
                                    m_current_ - newextensionlen,
                                m_extraCurrent_ += newextensionlen;
                        newextensionlen ++;
                    wasinquote = true;
                    ch = m_source_.charAt(m_current_);
                    if (ch == 0x0027) { // copy the double quote
                        m_source_.append(ch);
                        m_extraCurrent_ ++;
                // '@' is french only if the strength is not currently set
                // if it is, it's just a regular character in collation
                case 0x0040 : // '@'
                    if (newstrength == TOKEN_UNSET_) {
                        m_options_.m_isFrenchCollation_ = true;
                    // '|' denotes a context/prefix:
                    // this means we have actually been reading prefix part
                    // we want to store read characters to the prefix part
                    // and continue reading the characters (proper way
                    // would be to restart reading the chars, but in that
                    // case we would have to complicate the token hasher,
                    // which I do not intend to play with. Instead, we will
                    // do prefixes when prefixes are due (before adding the
                    m_parsedToken_.m_prefixOffset_
                                            = m_parsedToken_.m_charsOffset_;
                    m_parsedToken_.m_prefixLen_
                                            = m_parsedToken_.m_charsLen_;
                    if (inchars) { // we're doing characters
                        if (wasinquote == false) {
                            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
                            if (m_parsedToken_.m_charsLen_ != 0) {
                                String prefix = m_source_.substring(
                                    m_current_ - m_parsedToken_.m_charsLen_,
                                m_source_.append(prefix);
                                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
                        m_parsedToken_.m_charsLen_ ++;
                    wasinquote = true;
                    ch = m_source_.charAt(m_current_);
                    // skip whitespace between '|' and the character
                    } while (UCharacterProperty.isRuleWhiteSpace(ch));
                case 0x0023: // '#' // this is a comment, skip everything through the end of line
                    ch = m_source_.charAt(m_current_);
                    } while (!isCharNewLine(ch));
                case 0x0021: // '!' // ignoring java set thai reordering
                    // default: an ordinary token character
                    if (newstrength == TOKEN_UNSET_) {
                        if(m_prevStrength_ == TOKEN_UNSET_){
                            throwParseException(m_rules_, m_current_);
                        newstrength = m_prevStrength_;
                    if (isSpecialChar(ch) && (inquote == false)) {
                        throwParseException(m_rules_, m_current_);
                    if (ch == 0x0000 && m_current_ + 1 == limit) {
                    if (m_parsedToken_.m_charsLen_ == 0) {
                        m_parsedToken_.m_charsOffset_ = m_current_;
                    m_parsedToken_.m_charsLen_++;
                    // inside a starred sequence every character is its
                    // own token at the remembered strength
                    if(m_prevStrength_ != TOKEN_UNSET_){
                        char[] fullchar = Character.toChars(Character.codePointAt(m_source_, m_current_));
                        m_current_ += fullchar.length;
                        m_parsedToken_.m_charsLen_ += fullchar.length - 1;
                        return doEndParseNextToken(newstrength,
                                            variabletop, before);
                    if (newextensionlen == 0) {
                        extensionoffset = m_current_;
                    newextensionlen ++;
            // while in a quote, straight copy of chars into the extra tail
            m_source_.append(ch);
            m_extraCurrent_ ++;
    // end of rules reached: emit whatever token is pending
    return doEndParseNextToken(newstrength, top,
                               extensionoffset, newextensionlen,
                               variabletop, before);
\r
1559 * End the next parse token
\r
1560 * @param newstrength new strength
\r
1561 * @return offset in rules, -1 for end of rules
\r
1563 private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
\r
1564 boolean top, /*int charsoffset,*/
\r
1565 int extensionoffset, int newextensionlen,
\r
1566 boolean variabletop, int before)
\r
1567 throws ParseException
\r
1569 if (newstrength == TOKEN_UNSET_) {
\r
1572 if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
\r
1573 throwParseException(m_rules_, m_current_);
\r
1576 m_parsedToken_.m_strength_ = newstrength;
\r
1577 //m_parsedToken_.m_charsOffset_ = charsoffset;
\r
1578 //m_parsedToken_.m_charsLen_ = newcharslen;
\r
1579 m_parsedToken_.m_extensionOffset_ = extensionoffset;
\r
1580 m_parsedToken_.m_extensionLen_ = newextensionlen;
\r
1581 m_parsedToken_.m_flags_ = (char)
\r
1582 ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
\r
1583 | (top ? TOKEN_TOP_MASK_ : 0) | before);
\r
1584 return m_current_;
\r
/**
 * Token before this element
 * @param sourcetoken
 * @param strength collation strength
 * @return the token before source token
 * @exception ParseException thrown when rules have the wrong syntax
 */
// NOTE(review): lines are elided by extraction in this excerpt (several
// braces, else-branches, and the declarations of 'ch' and 'invpos' are
// missing). Visible code tokens are kept verbatim; only comments added.
private Token getVirginBefore(Token sourcetoken, int strength)
                                                    throws ParseException
    // this is a virgin before - we need to fish the anchor from the UCA
    if (sourcetoken != null) {
        // low 24 bits of m_source_ are the offset into the rules text
        int offset = sourcetoken.m_source_ & 0xFFFFFF;
        m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
        // no source token: anchor on the just-parsed token's first char
        m_UCAColEIter_.setText(
            m_source_.substring(m_parsedToken_.m_charsOffset_,
                                m_parsedToken_.m_charsOffset_ + 1));
    // base CE (continuation bits masked off) and its continuation
    int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
    int basecontce = m_UCAColEIter_.next();
    if (basecontce == CollationElementIterator.NULLORDER) {
    // implicit (CJK-style) primaries get special "previous" handling:
    if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
            && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
        // reassemble the full implicit primary, step one code point back,
        // and rebuild the CE pair for that previous code point
        int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
        int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
        ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
        int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
        m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
        m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;

        // tag the synthetic anchor with 0xFFFE + the previous code point
        m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
        m_source_.append('\uFFFE');
        m_source_.append((char)ch);
        m_extraCurrent_ += 2;
        m_parsedToken_.m_charsLen_++;

        // look up an existing token for the synthetic anchor
        m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                             | m_parsedToken_.m_charsOffset_;
        m_utilToken_.m_rules_ = m_source_;
        sourcetoken = m_hashTable_.get(m_utilToken_);

        if(sourcetoken == null) {
            // not tailored yet: start a fresh list anchored on these CEs
            m_listHeader_[m_resultLength_] = new TokenListHeader();
            m_listHeader_[m_resultLength_].m_baseCE_
                                         = m_utilCEBuffer_[0] & 0xFFFFFF3F;
            if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                m_listHeader_[m_resultLength_].m_baseContCE_
                                                      = m_utilCEBuffer_[1];
                m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = false;
            sourcetoken = new Token();
            initAReset(-1, sourcetoken);
        // non-implicit: ask the inverse UCA for the previous CE;
        // first ce and second ce m_utilCEBuffer_
        /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
                                                     basece, basecontce,
                                                     strength, m_utilCEBuffer_);
        // we got the previous CE. Now we need to see if the difference between
        // the two CEs is really of the requested strength.
        // if it's a bigger difference (we asked for secondary and got primary), we
        // need to modify the CE.
        if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
            // adjust the strength
            // now we are in the situation where our baseCE should actually be modified in
            // order to get the CE in the right position.
            if(strength == Collator.SECONDARY) {
                m_utilCEBuffer_[0] = basece - 0x0200;
            } else { // strength == UCOL_TERTIARY
                m_utilCEBuffer_[0] = basece - 0x02;
            if(RuleBasedCollator.isContinuation(basecontce)) {
                if(strength == Collator.SECONDARY) {
                    m_utilCEBuffer_[1] = basecontce - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    m_utilCEBuffer_[1] = basecontce - 0x02;

        // the code below relies on getting a code point from the inverse table, in order to be
        // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
        // 1. There are many code points that have the same CE
        // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
        // Also, in case when there is no equivalent strength before an element, we have to actually
        // construct one. For example, &[before 2]a << x won't result in x << a, because the element
        // before a is a primary difference.
        ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
        if ((ch & INVERSE_SIZE_MASK_) != 0) {
            // packed entry: real code point lives in the continuations table
            int offset = ch & INVERSE_OFFSET_MASK_;
            ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
        m_source_.append((char)ch);
        m_extraCurrent_ ++;
        m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
        m_parsedToken_.m_charsLen_ = 1;

        // We got an UCA before. However, this might have been tailored.
        // example:
        // &\u30ca = \u306a
        // &[before 3]\u306a<<<\u306a|\u309d
        m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                             | m_parsedToken_.m_charsOffset_;
        m_utilToken_.m_rules_ = m_source_;
        sourcetoken = (Token)m_hashTable_.get(m_utilToken_);

        // here is how it should be. The situation such as &[before 1]a < x, should be
        // resolved exactly as if we wrote &a > x.
        // therefore, I don't really care if the UCA value before a has been changed.
        // However, I do care if the strength between my element and the previous element
        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
        // have to construct the base CE.

        // if we found a tailored thing, we have to use the UCA value and
        // construct a new reset token with constructed name
        //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
            // character to which we want to anchor is already tailored.
            // We need to construct a new token which will be the anchor point
            //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
            //m_source_.append(ch);
            //m_extraCurrent_ ++;
            //m_parsedToken_.m_charsLen_ ++;
            // grab the synthetic-anchor text appended earlier
            m_parsedToken_.m_charsOffset_ -= 10;
            m_parsedToken_.m_charsLen_ += 10;
            m_listHeader_[m_resultLength_] = new TokenListHeader();
            m_listHeader_[m_resultLength_].m_baseCE_
                                             = m_utilCEBuffer_[0] & 0xFFFFFF3F;
            if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                m_listHeader_[m_resultLength_].m_baseContCE_
                                                      = m_utilCEBuffer_[1];
                m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextCE_ = 0;
            m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousCE_ = 0;
            m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
            m_listHeader_[m_resultLength_].m_indirect_ = false;
            sourcetoken = new Token();
            initAReset(-1, sourcetoken);
    return sourcetoken;
\r
/**
 * Processing Description.
 * 1. Build a m_listHeader_. Each list has a header, which contains two lists
 * (positive and negative), a reset token, a baseCE, nextCE, and
 * previousCE. The lists and reset may be null.
 * 2. As you process, you keep a LAST pointer that points to the last token
 * @param expand string offset, -1 for null strings
 * @param targetToken token to update
 * @return expandnext offset
 * @throws ParseException thrown when rules syntax failed
 */
// NOTE(review): lines elided by extraction here include the declaration
// of 'result', the "<< 24)" shift of the expansion offset, the
// non-expansion else branch and the return. Visible code kept verbatim.
private int initAReset(int expand, Token targetToken) throws ParseException
    // grow the list-header array (doubling) when it is full
    if (m_resultLength_ == m_listHeader_.length - 1) {
        // Unfortunately, this won't work, as we store addresses of lhs in
        TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
        System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
        m_listHeader_ = temp;
    // do the reset thing
    targetToken.m_rules_ = m_source_;
    // pack length (high byte) and offset (low 24 bits) of the chars
    targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
                                         | m_parsedToken_.m_charsOffset_;
    targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
                                         | m_parsedToken_.m_extensionOffset_;
    // keep the flags around so that we know about before
    targetToken.m_flags_ = m_parsedToken_.m_flags_;

    // a reset token must not carry a prefix
    if (m_parsedToken_.m_prefixOffset_ != 0) {
        throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);

    targetToken.m_prefix_ = 0;
    // TODO: this should also handle reverse
    targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
    targetToken.m_strength_ = TOKEN_RESET_;
    targetToken.m_next_ = null;
    targetToken.m_previous_ = null;
    targetToken.m_CELength_ = 0;
    targetToken.m_expCELength_ = 0;
    targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
    m_listHeader_[m_resultLength_].m_first_ = null;
    m_listHeader_[m_resultLength_].m_last_ = null;
    // NOTE(review): the two assignments below duplicate the two above in
    // the original source; kept verbatim.
    m_listHeader_[m_resultLength_].m_first_ = null;
    m_listHeader_[m_resultLength_].m_last_ = null;
    m_listHeader_[m_resultLength_].m_reset_ = targetToken;

    /* 3 Consider each item: relation, source, and expansion:
     * e.g. ...< x / y ...
     * First convert all expansions into normal form. Examples:
     * If "xy" doesn't occur earlier in the list or in the UCA, convert
     * &xy * c * d * ... into &x * c/y * d * ...
     * Note: reset values can never have expansions, although they can
     * cause the very next item to have one. They may be contractions, if
     * they are found earlier in the list.
     */
    // check to see if there is an expansion
    if (m_parsedToken_.m_charsLen_ > 1) {
        // split the chars: first char stays, the rest becomes expandnext
        targetToken.m_source_ = ((expand
                                  - m_parsedToken_.m_charsOffset_ )
                                 | m_parsedToken_.m_charsOffset_;
        result = ((m_parsedToken_.m_charsLen_
                   + m_parsedToken_.m_charsOffset_ - expand) << 24)

    m_resultLength_ ++;
    // register the reset token so later tokens can find it by source
    m_hashTable_.put(targetToken, targetToken);
\r
1837 * Checks if an character is special
\r
1838 * @param ch character to test
\r
1839 * @return true if the character is special
\r
1841 private static final boolean isSpecialChar(char ch)
\r
1843 return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A)
\r
1844 || (ch <= 0x0060 && ch >= 0x005B)
\r
1845 || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
\r
1849 UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException
\r
1851 while(source.charAt(start) != '[') { /* advance while we find the first '[' */
\r
1854 // now we need to get a balanced set of '[]'. The problem is that a set can have
\r
1855 // many, and *end point to the first closing '['
\r
1856 int noOpenBraces = 1;
\r
1857 int current = 1; // skip the opening brace
\r
1858 while(start+current < source.length() && noOpenBraces != 0) {
\r
1859 if(source.charAt(start+current) == '[') {
\r
1861 } else if(source.charAt(start+current) == ']') { // closing brace
\r
1866 //int nextBrace = -1;
\r
1868 if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {
\r
1869 throwParseException(m_rules_, start);
\r
1871 return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
\r
/** in C, optionarg is passed by reference to function.
 * We use a private int to simulate this.
 */
// Side-channel output of readOption(): offset in the rules string of the
// matched option's argument (0 when the option has no argument).
private int m_optionarg_ = 0;
\r
/**
 * Matches the rules text at start against the names in RULES_OPTIONS_
 * (case-insensitively) and, when the option has an argument before
 * optionend, records the argument's offset in m_optionarg_.
 * NOTE(review): the declaration of the loop index 'i', the loop advance,
 * and the return statement are on lines elided by the extraction; the
 * visible tail presumably maps "no match" to a sentinel — TODO confirm
 * against the full source. Visible code kept verbatim.
 */
private int readOption(String rules, int start, int optionend)
    while (i < RULES_OPTIONS_.length) {
        String option = RULES_OPTIONS_[i].m_name_;
        int optionlength = option.length();
        // case-insensitive prefix match of the option name
        if (rules.length() > start + optionlength
            && option.equalsIgnoreCase(rules.substring(start,
                                                      start + optionlength))) {
            if (optionend - start > optionlength) {
                m_optionarg_ = start + optionlength;
                // start of the options, skip space
                while (m_optionarg_ < optionend && (UCharacter.isWhitespace(rules.charAt(m_optionarg_)) || UCharacterProperty.isRuleWhiteSpace(rules.charAt(m_optionarg_))))
                { // eat whitespace
    if(i == RULES_OPTIONS_.length) {
\r
/**
 * Reads and set collation options
 * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
 * @exception ParseException thrown when options in rules are wrong
 */
// NOTE(review): lines elided by extraction (several braces, loop
// advances, the "i < 7" attribute-branch header and brace-count
// updates). Visible code tokens kept verbatim; only comments added.
private byte readAndSetOption() throws ParseException
    int start = m_current_ + 1; // skip opening '['
    int i = readOption(m_rules_, start, m_optionEnd_);

    int optionarg = m_optionarg_;
        // unrecognized option name
        throwParseException(m_rules_, start);
    // attribute-style options (strength, case level, ...): match the
    // sub-option value and apply it
    if (optionarg != 0) {
        for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
            String subname = RULES_OPTIONS_[i].m_subOptions_[j];
            int size = optionarg + subname.length();
            if (m_rules_.length() > size
                && subname.equalsIgnoreCase(m_rules_.substring(
                                                       optionarg, size))) {
                setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
                           RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                return TOKEN_SUCCESS_MASK_;
        // no sub-option matched
        throwParseException(m_rules_, optionarg);
    else if (i == 7) { // variable top
        return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
    else if (i == 8) { // rearange
        return TOKEN_SUCCESS_MASK_;
    else if (i == 9) { // before
        if (optionarg != 0) {
            for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                int size = optionarg + subname.length();
                if (m_rules_.length() > size
                    && subname.equalsIgnoreCase(
                                           m_rules_.substring(optionarg,
                                           optionarg + subname.length()))) {
                    // encode the before-strength into the result bits
                    return (byte)(TOKEN_SUCCESS_MASK_
                              | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
        throwParseException(m_rules_, optionarg);
    else if (i == 10) { // top, we are going to have an array with
        // structures of limit CEs index to this array will be
        // src->parsedToken.indirectIndex
        m_parsedToken_.m_indirectIndex_ = 0;
        return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
    else if (i < 13) { // first, last
        for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {
            String subname = RULES_OPTIONS_[i].m_subOptions_[j];
            int size = optionarg + subname.length();
            if (m_rules_.length() > size
                && subname.equalsIgnoreCase(m_rules_.substring(optionarg,
                // indirect boundary index derived from option/sub-option
                m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
                return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        throwParseException(m_rules_, optionarg);
    else if(i == 13 || i == 14) { // copy and remove are handled before normalization
        // we need to move end here
        int noOpenBraces = 1;
        m_current_++; // skip opening brace
        // scan past the balanced [..] set so normal parsing resumes after it
        while(m_current_ < m_source_.length() && noOpenBraces != 0) {
            if(m_source_.charAt(m_current_) == '[') {
            } else if(m_source_.charAt(m_current_) == ']') { // closing brace
        m_optionEnd_ = m_current_-1;
        return TOKEN_SUCCESS_MASK_;
        throwParseException(m_rules_, optionarg);
    return TOKEN_SUCCESS_MASK_; // we will never reach here.
\r
2005 * Set collation option
\r
2006 * @param optionset option set to set
\r
2007 * @param attribute type to set
\r
2008 * @param value attribute value
\r
2010 private void setOptions(OptionSet optionset, int attribute, int value)
\r
2012 switch (attribute) {
\r
2013 case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :
\r
2014 optionset.m_isHiragana4_
\r
2015 = (value == RuleBasedCollator.AttributeValue.ON_);
\r
2017 case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :
\r
2018 optionset.m_isFrenchCollation_
\r
2019 = (value == RuleBasedCollator.AttributeValue.ON_);
\r
2021 case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :
\r
2022 optionset.m_isAlternateHandlingShifted_
\r
2024 == RuleBasedCollator.AttributeValue.SHIFTED_);
\r
2026 case RuleBasedCollator.Attribute.CASE_FIRST_ :
\r
2027 optionset.m_caseFirst_ = value;
\r
2029 case RuleBasedCollator.Attribute.CASE_LEVEL_ :
\r
2030 optionset.m_isCaseLevel_
\r
2031 = (value == RuleBasedCollator.AttributeValue.ON_);
\r
2033 case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :
\r
2034 if (value == RuleBasedCollator.AttributeValue.ON_) {
\r
2035 value = Collator.CANONICAL_DECOMPOSITION;
\r
2037 optionset.m_decomposition_ = value;
\r
2039 case RuleBasedCollator.Attribute.STRENGTH_ :
\r
2040 optionset.m_strength_ = value;
\r
// NOTE(review): lines elided by extraction here include the declaration
// of 'pattern', the body of the end-of-rules branch, and the method tail
// (presumably "return tailored;" — TODO confirm against full source).
// Visible code tokens kept verbatim; only comments added.
/**
 * Tokenizes the rule string and collects, for each non-reset token, all
 * canonically equivalent FCD sequences into a UnicodeSet.
 * @throws ParseException when rule parsing fails
 */
UnicodeSet getTailoredSet() throws ParseException
    boolean startOfRules = true;
    UnicodeSet tailored = new UnicodeSet();
    CanonicalIterator it = new CanonicalIterator("");

    m_parsedToken_.m_strength_ = TOKEN_UNSET_;
    int sourcelimit = m_source_.length();
    //int expandNext = 0;

    while (m_current_ < sourcelimit) {
        m_parsedToken_.m_prefixOffset_ = 0;
        if (parseNextToken(startOfRules) < 0) {
            // we have reached the end
        startOfRules = false;
        // The idea is to tokenize the rule set. For each non-reset token,
        // we add all the canonicaly equivalent FCD sequences
        if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {
            it.setSource(m_source_.substring(
                  m_parsedToken_.m_charsOffset_,
                  m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));
            pattern = it.next();
            while(pattern != null) {
                // keep only sequences that are (possibly) in FCD form
                if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) {
                    tailored.add(pattern);
                pattern = it.next();
\r
2083 final private void extractSetsFromRules(String rules) throws ParseException {
\r
2084 int optionNumber = -1;
\r
2087 while(i < rules.length()) {
\r
2088 if(rules.charAt(i) == 0x005B) {
\r
2089 optionNumber = readOption(rules, i+1, rules.length());
\r
2090 setStart = m_optionarg_;
\r
2091 if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
\r
2092 UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
\r
2093 if(m_copySet_ == null) {
\r
2094 m_copySet_ = newSet;
\r
2096 m_copySet_.addAll(newSet);
\r
2098 } else if(optionNumber == 14) {
\r
2099 UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
\r
2100 if(m_removeSet_ == null) {
\r
2101 m_removeSet_ = newSet;
\r
2103 m_removeSet_.addAll(newSet);
\r