2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.text.CharacterIterator;
\r
11 import java.text.StringCharacterIterator;
\r
12 import java.util.Locale;
\r
14 import com.ibm.icu.impl.CharacterIteratorWrapper;
\r
15 import com.ibm.icu.impl.Norm2AllModes;
\r
16 import com.ibm.icu.impl.Normalizer2Impl;
\r
17 import com.ibm.icu.lang.UCharacter;
\r
18 import com.ibm.icu.util.ULocale;
\r
22 * <code>StringSearch</code> is the concrete subclass of
\r
23 * <code>SearchIterator</code> that provides language-sensitive text searching
\r
24 * based on the comparison rules defined in a {@link RuleBasedCollator} object.
\r
27 * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
\r
28 * algorithm that has been adapted to work with the large character set of
\r
29 * Unicode. Refer to
\r
30 * <a href="http://www.icu-project.org/docs/papers/efficient_text_searching_in_java.html">
\r
31 * "Efficient Text Searching in Java"</a>, published in the
\r
32 * <i>Java Report</i> on February, 1999, for further information on the
\r
36 * Users are also strongly encouraged to read the section on
\r
37 * <a href="http://www.icu-project.org/userguide/searchString.html">
\r
38 * String Search</a> and
\r
39 * <a href="http://www.icu-project.org/userguide/Collate_Intro.html">
\r
40 * Collation</a> in the user guide before attempting to use this class.
\r
43 * String searching becomes a little complicated when accents are encountered at
\r
44 * match boundaries. If a match is found and it has preceding or trailing
\r
45 * accents not part of the match, the result returned will include the
\r
46 * preceding accents up to the first base character, if the pattern searched
\r
47 * for starts with an accent. Likewise,
\r
48 * if the pattern ends with an accent, all trailing accents up to the first
\r
49 * base character will be included in the result.
\r
52 * For example, if a match is found in target text "a\u0325\u0300" for
\r
54 * "a\u0325", the result returned by StringSearch will be the index 0 and
\r
55 * length 3 <0, 3>. If a match is found in the target
\r
56 * "a\u0325\u0300"
\r
57 * for the pattern "\u0300", then the result will be index 1 and length 2
\r
61 * In the case where the decomposition mode is on for the RuleBasedCollator,
\r
62 * all matches that start or end with an accent will have their results include
\r
63 * preceding or following accents respectively. For example, if pattern "a" is
\r
64 * looked for in the target text "á\u0325", the result will be
\r
65 * index 0 and length 2 <0, 2>.
\r
68 * The StringSearch class provides two options to handle accent matching
\r
72 * Let S' be the sub-string of a text string S between the offsets start and
\r
73 * end <start, end>.
\r
75 * A pattern string P matches a text string S at the offsets <start,
\r
80 * option 1. P matches some canonical equivalent string of S'. Suppose the
\r
81 * RuleBasedCollator used for searching has a collation strength of
\r
82 * TERTIARY, all accents are non-ignorable. If the pattern
\r
83 * "a\u0300" is searched in the target text
\r
84 * "a\u0325\u0300",
\r
85 * a match will be found, since the target text is canonically
\r
86 * equivalent to "a\u0300\u0325"
\r
87 * option 2. P matches S' and if P starts or ends with a combining mark,
\r
88 * there exists no non-ignorable combining mark before or after S'
\r
89 * in S respectively. Following the example above, the pattern
\r
90 * "a\u0300" will not find a match in "a\u0325\u0300",
\r
92 * there exists a non-ignorable accent '\u0325' in the middle of
\r
93 * 'a' and '\u0300'. Even with a target text of
\r
94 * "a\u0300\u0325" a match will not be found because of the
\r
95 * non-ignorable trailing accent \u0325.
\r
97 * Option 2. will be the default mode for dealing with boundary accents unless
\r
98 * specified via the API setCanonical(boolean).
\r
99 * One restriction is to be noted for option 1. Currently there are no
\r
100 * composite characters that consist of a character with combining class > 0
\r
101 * before a character with combining class == 0. However, if such a character
\r
102 * exists in the future, the StringSearch may not work correctly with option 1
\r
103 * when such characters are encountered.
\r
106 * <tt>SearchIterator</tt> provides APIs to specify the starting position
\r
107 * within the text string to be searched, e.g. <tt>setIndex</tt>,
\r
108 * <tt>preceding</tt> and <tt>following</tt>. Since the starting position will
\r
109 * be set as it is specified, please take note that there are some dangerous
\r
110 * positions which the search may render incorrect results:
\r
112 * <li> The midst of a substring that requires decomposition.
\r
113 * <li> If the following match is to be found, the position should not be the
\r
114 * second character which requires to be swapped with the preceding
\r
115 * character. Vice versa, if the preceding match is to be found,
\r
116 * position to search from should not be the first character which
\r
117 * requires to be swapped with the next character. E.g certain Thai and
\r
118 * Lao characters require swapping.
\r
119 * <li> If a following pattern match is to be found, any position within a
\r
120 * contracting sequence except the first will fail. Vice versa if a
\r
121 * preceding pattern match is to be found, an invalid starting point
\r
122 * would be any character within a contracting sequence except the last.
\r
126 * Though collator attributes will be taken into consideration while
\r
127 * performing matches, there are no APIs provided in StringSearch for setting
\r
128 * and getting the attributes. These attributes can be set by getting the
\r
129 * collator from <tt>getCollator</tt> and using the APIs in
\r
130 * <tt>com.ibm.icu.text.Collator</tt>. To update StringSearch to the new
\r
131 * collator attributes, <tt>reset()</tt> or
\r
132 * <tt>setCollator(RuleBasedCollator)</tt> has to be called.
\r
136 * <a href="http://www.icu-project.org/userguide/searchString.html">
\r
137 * String Search</a> user guide and the <code>SearchIterator</code>
\r
138 * documentation for more information and examples of use.
\r
141 * This class is not subclassable
\r
143 * @see SearchIterator
\r
144 * @see RuleBasedCollator
\r
145 * @author Laura Werner, synwee
\r
148 // internal notes: all methods do not guarantee the correct status of the
\r
149 // characteriterator. the caller has to maintain the original index position
\r
150 // if necessary. methods could change the index position as it deems fit
\r
151 public final class StringSearch extends SearchIterator
\r
154 // public constructors --------------------------------------------------
\r
157 * Initializes the iterator to use the language-specific rules defined in
\r
158 * the argument collator to search for argument pattern in the argument
\r
159 * target text. The argument breakiter is used to define logical matches.
\r
160 * See super class documentation for more details on the use of the target
\r
161 * text and BreakIterator.
\r
162 * @param pattern text to look for.
\r
163 * @param target target text to search for pattern.
\r
164 * @param collator RuleBasedCollator that defines the language rules
\r
165 * @param breakiter A {@link BreakIterator} that is used to determine the
\r
166 * boundaries of a logical match. This argument can be null.
\r
167 * @exception IllegalArgumentException thrown when argument target is null,
\r
169 * @see BreakIterator
\r
170 * @see RuleBasedCollator
\r
171 * @see SearchIterator
\r
174 public StringSearch(String pattern, CharacterIterator target,
\r
175 RuleBasedCollator collator, BreakIterator breakiter)
\r
177 super(target, breakiter);
\r
178 m_textBeginOffset_ = targetText.getBeginIndex();
\r
179 m_textLimitOffset_ = targetText.getEndIndex();
\r
180 m_collator_ = collator;
\r
181 m_colEIter_ = m_collator_.getCollationElementIterator(target);
\r
182 m_utilColEIter_ = collator.getCollationElementIterator("");
\r
183 m_ceMask_ = getMask(m_collator_.getStrength());
\r
184 m_isCanonicalMatch_ = false;
\r
185 m_pattern_ = new Pattern(pattern);
\r
186 m_matchedIndex_ = DONE;
\r
187 m_charBreakIter_ = BreakIterator.getCharacterInstance(/*m_collator_.getLocale(ULocale.ACTUAL_LOCALE)*/);
\r
188 m_charBreakIter_.setText(target);
\r
193 * Initializes the iterator to use the language-specific rules defined in
\r
194 * the argument collator to search for argument pattern in the argument
\r
195 * target text. No BreakIterators are set to test for logical matches.
\r
196 * @param pattern text to look for.
\r
197 * @param target target text to search for pattern.
\r
198 * @param collator RuleBasedCollator that defines the language rules
\r
199 * @exception IllegalArgumentException thrown when argument target is null,
\r
201 * @see RuleBasedCollator
\r
202 * @see SearchIterator
\r
205 public StringSearch(String pattern, CharacterIterator target,
\r
206 RuleBasedCollator collator)
\r
208 this(pattern, target, collator, null/*BreakIterator.getCharacterInstance()*/);
\r
212 * Initializes the iterator to use the language-specific rules and
\r
213 * break iterator rules defined in the argument locale to search for
\r
214 * argument pattern in the argument target text.
\r
215 * See super class documentation for more details on the use of the target
\r
216 * text and BreakIterator.
\r
217 * @param pattern text to look for.
\r
218 * @param target target text to search for pattern.
\r
219 * @param locale locale to use for language and break iterator rules
\r
220 * @exception IllegalArgumentException thrown when argument target is null,
\r
221 * or of length 0. ClassCastException thrown if the collator for
\r
222 * the specified locale is not a RuleBasedCollator.
\r
223 * @see BreakIterator
\r
224 * @see RuleBasedCollator
\r
225 * @see SearchIterator
\r
228 public StringSearch(String pattern, CharacterIterator target, Locale locale)
\r
230 this(pattern, target, ULocale.forLocale(locale));
\r
234 * Initializes the iterator to use the language-specific rules and
\r
235 * break iterator rules defined in the argument locale to search for
\r
236 * argument pattern in the argument target text.
\r
237 * See super class documentation for more details on the use of the target
\r
238 * text and BreakIterator.
\r
239 * @param pattern text to look for.
\r
240 * @param target target text to search for pattern.
\r
241 * @param locale ulocale to use for language and break iterator rules
\r
242 * @exception IllegalArgumentException thrown when argument target is null,
\r
243 * or of length 0. ClassCastException thrown if the collator for
\r
244 * the specified locale is not a RuleBasedCollator.
\r
245 * @see BreakIterator
\r
246 * @see RuleBasedCollator
\r
247 * @see SearchIterator
\r
250 public StringSearch(String pattern, CharacterIterator target, ULocale locale)
\r
252 this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale),
\r
253 null/*BreakIterator.getCharacterInstance(locale)*/);
\r
257 * Initializes the iterator to use the language-specific rules and
\r
258 * break iterator rules defined in the default locale to search for
\r
259 * argument pattern in the argument target text.
\r
260 * See super class documentation for more details on the use of the target
\r
261 * text and BreakIterator.
\r
262 * @param pattern text to look for.
\r
263 * @param target target text to search for pattern.
\r
264 * @exception IllegalArgumentException thrown when argument target is null,
\r
265 * or of length 0. ClassCastException thrown if the collator for
\r
266 * the default locale is not a RuleBasedCollator.
\r
267 * @see BreakIterator
\r
268 * @see RuleBasedCollator
\r
269 * @see SearchIterator
\r
272 public StringSearch(String pattern, String target)
\r
274 this(pattern, new StringCharacterIterator(target),
\r
275 (RuleBasedCollator)Collator.getInstance(),
\r
276 null/*BreakIterator.getCharacterInstance()*/);
\r
279 // public getters -----------------------------------------------------
\r
283 * Gets the RuleBasedCollator used for the language rules.
\r
286 * Since StringSearch depends on the returned RuleBasedCollator, any
\r
287 * changes to the RuleBasedCollator result should follow with a call to
\r
288 * either StringSearch.reset() or
\r
289 * StringSearch.setCollator(RuleBasedCollator) to ensure the correct
\r
290 * search behaviour.
\r
292 * @return RuleBasedCollator used by this StringSearch
\r
293 * @see RuleBasedCollator
\r
294 * @see #setCollator
\r
297 public RuleBasedCollator getCollator()
\r
299 return m_collator_;
\r
303 * Returns the pattern for which StringSearch is searching for.
\r
304 * @return the pattern searched for
\r
307 public String getPattern()
\r
309 return m_pattern_.targetText;
\r
313 * Return the index in the target text where the iterator is currently
\r
315 * If the iteration has gone past the end of the target text or past
\r
316 * the beginning for a backwards search, {@link #DONE} is returned.
\r
317 * @return index in the target text where the iterator is currently
\r
321 public int getIndex()
\r
323 int result = m_colEIter_.getOffset();
\r
324 if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) {
\r
331 * Determines whether canonical matches (option 1, as described in the
\r
332 * class documentation) is set.
\r
333 * See setCanonical(boolean) for more information.
\r
334 * @see #setCanonical
\r
335 * @return true if canonical matches is set, false otherwise
\r
338 public boolean isCanonical()
\r
340 return m_isCanonicalMatch_;
\r
343 // public setters -----------------------------------------------------
\r
347 * Sets the RuleBasedCollator to be used for language-specific searching.
\r
350 * This method causes internal data such as Boyer-Moore shift tables
\r
351 * to be recalculated, but the iterator's position is unchanged.
\r
353 * @param collator to use for this StringSearch
\r
354 * @exception IllegalArgumentException thrown when collator is null
\r
355 * @see #getCollator
\r
358 public void setCollator(RuleBasedCollator collator)
\r
360 if (collator == null) {
\r
361 throw new IllegalArgumentException("Collator can not be null");
\r
363 m_collator_ = collator;
\r
364 m_ceMask_ = getMask(m_collator_.getStrength());
\r
365 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
\r
367 m_colEIter_.setCollator(m_collator_);
\r
368 m_utilColEIter_.setCollator(m_collator_);
\r
369 m_charBreakIter_ = BreakIterator.getCharacterInstance(/*collator.getLocale(ULocale.VALID_LOCALE)*/);
\r
370 m_charBreakIter_.setText(targetText);
\r
375 * Set the pattern to search for.
\r
378 * This method causes internal data such as Boyer-Moore shift tables
\r
379 * to be recalculated, but the iterator's position is unchanged.
\r
381 * @param pattern for searching
\r
383 * @exception IllegalArgumentException thrown if pattern is null or of
\r
387 public void setPattern(String pattern)
\r
389 if (pattern == null || pattern.length() <= 0) {
\r
390 throw new IllegalArgumentException(
\r
391 "Pattern to search for can not be null or of length 0");
\r
393 m_pattern_.targetText = pattern;
\r
398 * Set the target text to be searched. Text iteration will hence begin at
\r
399 * the start of the text string. This method is useful if you want to
\r
400 * re-use an iterator to search within a different body of text.
\r
401 * @param text new text iterator to look for match,
\r
402 * @exception IllegalArgumentException thrown when text is null or has
\r
407 public void setTarget(CharacterIterator text)
\r
409 super.setTarget(text);
\r
410 m_textBeginOffset_ = targetText.getBeginIndex();
\r
411 m_textLimitOffset_ = targetText.getEndIndex();
\r
412 m_colEIter_.setText(targetText);
\r
413 m_charBreakIter_.setText(targetText);
\r
418 * Sets the position in the target text which the next search will start
\r
419 * from to the argument. This method clears all previous states.
\r
422 * This method takes the argument position and sets the position in the
\r
423 * target text accordingly, without checking if position is pointing to a
\r
424 * valid starting point to begin searching.
\r
427 * Search positions that may render incorrect results are highlighted in
\r
428 * the class documentation.
\r
430 * @param position index to start next search from.
\r
431 * @exception IndexOutOfBoundsException thrown if argument position is out
\r
432 * of the target text range.
\r
436 public void setIndex(int position)
\r
438 super.setIndex(position);
\r
439 m_matchedIndex_ = DONE;
\r
440 m_colEIter_.setExactOffset(position);
\r
445 * Set the canonical match mode. See class documentation for details.
\r
446 * The default setting for this property is false.
\r
448 * @param allowCanonical flag indicator if canonical matches are allowed
\r
449 * @see #isCanonical
\r
452 public void setCanonical(boolean allowCanonical)
\r
454 m_isCanonicalMatch_ = allowCanonical;
\r
455 if (m_isCanonicalMatch_ == true) {
\r
456 if (m_canonicalPrefixAccents_ == null) {
\r
457 m_canonicalPrefixAccents_ = new StringBuilder();
\r
460 m_canonicalPrefixAccents_.delete(0,
\r
461 m_canonicalPrefixAccents_.length());
\r
463 if (m_canonicalSuffixAccents_ == null) {
\r
464 m_canonicalSuffixAccents_ = new StringBuilder();
\r
467 m_canonicalSuffixAccents_.delete(0,
\r
468 m_canonicalSuffixAccents_.length());
\r
473 // public miscellaneous methods -----------------------------------------
\r
477 * Resets the search iteration. All properties will be reset to the
\r
481 * Search will begin at the start of the target text if a forward iteration
\r
482 * is initiated before a backwards iteration. Otherwise if a
\r
483 * backwards iteration is initiated before a forwards iteration, the search
\r
484 * will begin at the end of the target text.
\r
487 * Canonical match option will be reset to false, ie an exact match.
\r
491 public void reset()
\r
493 // reset is setting the attributes that are already in string search,
\r
494 // hence all attributes in the collator should be retrieved without any
\r
497 m_isCanonicalMatch_ = false;
\r
498 m_ceMask_ = getMask(m_collator_.getStrength());
\r
499 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
\r
501 m_colEIter_.setCollator(m_collator_);
\r
502 m_colEIter_.reset();
\r
503 m_utilColEIter_.setCollator(m_collator_);
\r
506 // protected methods -----------------------------------------------------
\r
510 * Concrete method to provide the mechanism
\r
511 * for finding the next <b>forwards</b> match in the target text.
\r
512 * See super class documentation for its use.
\r
514 * @param start index in the target text at which the forwards search
\r
516 * @return the starting index of the next forwards match if found, DONE
\r
518 * @see #handlePrevious(int)
\r
522 protected int handleNext(int start)
\r
524 if (m_pattern_.m_CELength_ == 0) {
\r
526 if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
\r
527 m_matchedIndex_ = start;
\r
528 return m_matchedIndex_;
\r
531 targetText.setIndex(start);
\r
532 char ch = targetText.current();
\r
533 // ch can never be done, it is handled by next()
\r
534 char ch2 = targetText.next();
\r
535 if (ch2 == CharacterIterator.DONE) {
\r
536 m_matchedIndex_ = DONE;
\r
539 m_matchedIndex_ = targetText.getIndex();
\r
541 if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) {
\r
543 m_matchedIndex_ = targetText.getIndex();
\r
547 if (matchLength <= 0) {
\r
548 // we must have reversed direction after we reached the start
\r
549 // of the target text
\r
550 // see SearchIterator next(), it checks the bounds and returns
\r
551 // if it exceeds the range. It does not allow setting of
\r
553 if (start == m_textBeginOffset_) {
\r
554 m_matchedIndex_ = DONE;
\r
557 // for boundary check purposes. this will ensure that the
\r
558 // next match will not preceed the current offset
\r
559 // note search->matchedIndex will always be set to something
\r
561 m_matchedIndex_ = start - 1;
\r
565 // status checked below
\r
566 if (m_isCanonicalMatch_) {
\r
567 // can't use exact here since extra accents are allowed.
\r
568 handleNextCanonical(start);
\r
571 handleNextExact(start);
\r
574 if (m_matchedIndex_ == DONE) {
\r
575 targetText.setIndex(m_textLimitOffset_);
\r
578 targetText.setIndex(m_matchedIndex_);
\r
580 return m_matchedIndex_;
\r
585 * Concrete method to provide the mechanism
\r
586 * for finding the next <b>backwards</b> match in the target text.
\r
587 * See super class documentation for its use.
\r
589 * @param start index in the target text at which the backwards search
\r
591 * @return the starting index of the next backwards match if found, DONE
\r
593 * @see #handleNext(int)
\r
597 protected int handlePrevious(int start)
\r
599 if (m_pattern_.m_CELength_ == 0) {
\r
601 // start can never be DONE or 0, it is handled in previous
\r
602 targetText.setIndex(start);
\r
603 char ch = targetText.previous();
\r
604 if (ch == CharacterIterator.DONE) {
\r
605 m_matchedIndex_ = DONE;
\r
608 m_matchedIndex_ = targetText.getIndex();
\r
609 if (UTF16.isTrailSurrogate(ch)) {
\r
610 if (UTF16.isLeadSurrogate(targetText.previous())) {
\r
611 m_matchedIndex_ = targetText.getIndex();
\r
617 if (matchLength == 0) {
\r
618 // we must have reversed direction after we reached the end
\r
619 // of the target text
\r
620 // see SearchIterator next(), it checks the bounds and returns
\r
621 // if it exceeds the range. It does not allow setting of
\r
623 m_matchedIndex_ = DONE;
\r
625 if (m_isCanonicalMatch_) {
\r
626 // can't use exact here since extra accents are allowed.
\r
627 handlePreviousCanonical(start);
\r
630 handlePreviousExact(start);
\r
634 if (m_matchedIndex_ == DONE) {
\r
635 targetText.setIndex(m_textBeginOffset_);
\r
638 targetText.setIndex(m_matchedIndex_);
\r
640 return m_matchedIndex_;
\r
643 // private static inner classes ----------------------------------------
\r
645 private static class Pattern
\r
647 // protected methods -----------------------------------------------
\r
652 protected String targetText;
\r
654 * Array containing the collation elements of targetText
\r
656 protected int m_CE_[];
\r
658 * Number of collation elements in m_CE_
\r
660 protected int m_CELength_;
\r
662 * Flag indicator if targetText starts with an accent
\r
664 protected boolean m_hasPrefixAccents_;
\r
666 * Flag indicator if targetText ends with an accent
\r
668 protected boolean m_hasSuffixAccents_;
\r
670 * Default number of characters to shift for Boyer Moore
\r
672 protected int m_defaultShiftSize_;
\r
674 * Number of characters to shift for Boyer Moore, depending on the
\r
675 * source text to search
\r
677 protected char m_shift_[];
\r
679 * Number of characters to shift backwards for Boyer Moore, depending
\r
680 * on the source text to search
\r
682 protected char m_backShift_[];
\r
684 // protected constructors ------------------------------------------
\r
687 * Empty constructor
\r
689 protected Pattern(String pattern)
\r
691 targetText = pattern;
\r
692 m_CE_ = new int[INITIAL_ARRAY_SIZE_];
\r
694 m_hasPrefixAccents_ = false;
\r
695 m_hasSuffixAccents_ = false;
\r
696 m_defaultShiftSize_ = 1;
\r
697 m_shift_ = new char[MAX_TABLE_SIZE_];
\r
698 m_backShift_ = new char[MAX_TABLE_SIZE_];
\r
703 // private data members ------------------------------------------------
\r
706 * target text begin offset. Each targetText has a valid contiguous region
\r
707 * to iterate and this data member is the offset to the first such
\r
708 * character in the region.
\r
710 private int m_textBeginOffset_;
\r
712 * target text limit offset. Each targetText has a valid contiguous region
\r
713 * to iterate and this data member is the offset to 1 after the last such
\r
714 * character in the region.
\r
716 private int m_textLimitOffset_;
\r
718 * Upon completion of a search, m_matchIndex_ will store starting offset in
\r
719 * m_text for the match. The Value DONE is the default value.
\r
720 * If we are not at the start of the text or the end of the text and
\r
721 * m_matchedIndex_ is DONE it means that we can find any more matches in
\r
722 * that particular direction
\r
724 private int m_matchedIndex_;
\r
726 * Current pattern to search for
\r
728 private Pattern m_pattern_;
\r
730 * Collator whose rules are used to perform the search
\r
732 private RuleBasedCollator m_collator_;
\r
734 * The collation element iterator for the text source.
\r
736 private CollationElementIterator m_colEIter_;
\r
738 * Utility collation element, used throughout program for temporary
\r
741 private CollationElementIterator m_utilColEIter_;
\r
743 * The mask used on the collation elements to retrieve the valid strength
\r
746 private int m_ceMask_;
\r
748 * Buffer storing accents during a canonical search
\r
750 private StringBuilder m_canonicalPrefixAccents_;
\r
752 * Buffer storing accents during a canonical search
\r
754 private StringBuilder m_canonicalSuffixAccents_;
\r
756 * Flag to indicate if canonical search is to be done.
\r
757 * E.g looking for "a\u0300" in "a\u0318\u0300" will yield the match at 0.
\r
759 private boolean m_isCanonicalMatch_;
\r
761 * Character break iterator for boundary checking.
\r
763 private BreakIterator m_charBreakIter_;
\r
764 private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
\r
766 * Size of the shift tables
\r
768 private static final int MAX_TABLE_SIZE_ = 257;
\r
770 * Initial array size
\r
772 private static final int INITIAL_ARRAY_SIZE_ = 256;
\r
776 private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
\r
780 private static final int LAST_BYTE_MASK_ = 0xff;
\r
782 * Utility buffer for return values and temporary storage
\r
784 private int m_utilBuffer_[] = new int[2];
\r
786 * Unsigned 32-Bit Integer Mask
\r
788 private static final long UNSIGNED_32BIT_MASK = 0xffffffffL;
\r
790 // private methods -------------------------------------------------------
\r
793 * Hash a collation element from its full size (32 bits) down into a
\r
794 * value that can be used as an index into the shift tables. Right
\r
795 * now we do a modulus by the size of the hash table.
\r
796 * @param ce collation element
\r
797 * @return collapsed version of the collation element
\r
799 private static final int hash(int ce)
\r
801 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
\r
802 // well with the new collation where most of the latin 1 characters
\r
803 // are of the value xx000xxx. their hashes will most of the time be 0
\r
804 // to be discussed on the hash algo.
\r
805 return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_;
\r
808 private final char getFCD(int c) {
\r
809 return (char)m_nfcImpl_.getFCD16(c);
\r
812 * Gets the fcd value for a character at the argument index.
\r
813 * This method takes into accounts of the supplementary characters.
\r
814 * Note this method changes the offset in the character iterator.
\r
815 * @param str UTF16 string where character for fcd retrieval resides
\r
816 * @param offset position of the character whose fcd is to be retrieved
\r
817 * @return fcd value
\r
819 private final char getFCD(CharacterIterator str, int offset)
\r
821 char ch = str.setIndex(offset);
\r
822 int result = m_nfcImpl_.getFCD16FromSingleLead(ch);
\r
823 if (result != 0 && Character.isHighSurrogate(ch)) {
\r
824 char c2 = str.next();
\r
825 if (Character.isLowSurrogate(c2)) {
\r
826 result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2));
\r
831 return (char)result;
\r
834 * Gets the FCD value for the code point before the input offset.
\r
835 * Modifies the iterator's index.
\r
836 * @param iter text iterator
\r
837 * @param offset index after the character to test
\r
838 * @return FCD value for the character before offset
\r
840 private final int getFCDBefore(CharacterIterator iter, int offset) {
\r
842 iter.setIndex(offset);
\r
843 char c = iter.previous();
\r
844 if (UTF16.isSurrogate(c)) {
\r
845 if (Normalizer2Impl.UTF16Plus.isSurrogateLead(c)) {
\r
848 char lead = iter.previous();
\r
849 if (Character.isHighSurrogate(lead)) {
\r
850 result = m_nfcImpl_.getFCD16(Character.toCodePoint(lead, c));
\r
856 result = m_nfcImpl_.getFCD16FromSingleLead(c);
\r
861 * Gets the fcd value for a character at the argument index.
\r
862 * This method takes into accounts of the supplementary characters.
\r
863 * @param str UTF16 string where character for fcd retrieval resides
\r
864 * @param offset position of the character whose fcd is to be retrieved
\r
865 * @return fcd value
\r
867 private final char getFCD(String str, int offset)
\r
869 char ch = str.charAt(offset);
\r
870 int result = m_nfcImpl_.getFCD16FromSingleLead(ch);
\r
871 if (result != 0 && Character.isHighSurrogate(ch)) {
\r
873 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
\r
874 result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2));
\r
879 return (char)result;
\r
883 * Getting the modified collation elements taking into account the collation
\r
886 * @return the modified collation element
\r
888 private final int getCE(int ce)
\r
890 // note for tertiary we can't use the collator->tertiaryMask, that
\r
891 // is a preprocessed mask that takes into account case options. since
\r
892 // we are only concerned with exact matches, we don't need that.
\r
895 if (m_collator_.isAlternateHandlingShifted()) {
\r
896 // alternate handling here, since only the 16 most significant
\r
897 // digits is only used, we can safely do a compare without masking
\r
898 // if the ce is a variable, we mask and get only the primary values
\r
899 // no shifting to quartenary is required since all primary values
\r
900 // less than variabletop will need to be masked off anyway.
\r
901 if (((m_collator_.m_variableTopValue_ << 16) & UNSIGNED_32BIT_MASK) > (ce & UNSIGNED_32BIT_MASK)) {
\r
902 if (m_collator_.getStrength() == Collator.QUATERNARY) {
\r
903 ce = CollationElementIterator.primaryOrder(ce);
\r
906 ce = CollationElementIterator.IGNORABLE;
\r
915 * Appends a int to a int array, increasing the size of the array when
\r
916 * we are out of space.
\r
917 * @param offset in array to append to
\r
918 * @param value to append
\r
919 * @param array to append to
\r
920 * @return the array appended to, this could be a new and bigger array
\r
922 private static final int[] append(int offset, int value, int array[])
\r
924 if (offset >= array.length) {
\r
925 int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
\r
926 System.arraycopy(array, 0, temp, 0, array.length);
\r
929 array[offset] = value;
\r
934 * Initializing the ce table for a pattern. Stores non-ignorable collation
\r
935 * keys. Table size will be estimated by the size of the pattern text.
\r
936 * Table expansion will be perform as we go along. Adding 1 to ensure that
\r
937 * the table size definitely increases.
\r
938 * Internal method, status assumed to be a success.
\r
939 * @return total number of expansions
\r
941 private final int initializePatternCETable()
\r
943 m_utilColEIter_.setText(m_pattern_.targetText);
\r
947 int ce = m_utilColEIter_.next();
\r
949 while (ce != CollationElementIterator.NULLORDER) {
\r
950 int newce = getCE(ce);
\r
951 if (newce != CollationElementIterator.IGNORABLE) {
\r
952 m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_);
\r
955 result += m_utilColEIter_.getMaxExpansion(ce) - 1;
\r
956 ce = m_utilColEIter_.next();
\r
959 m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_);
\r
960 m_pattern_.m_CELength_ = offset;
\r
966 * Initializes the pattern struct.
\r
967 * Internal method, status assumed to be success.
\r
968 * @return expansionsize the total expansion size of the pattern
\r
// Initializes the pattern struct: records whether the pattern starts/ends
// with accents (FCD checks, skipped entirely at PRIMARY strength) and then
// delegates to initializePatternCETable(); returns its expansion count.
// NOTE(review): extraction-damaged listing — original lines 971, 975, 981
// (braces/else) are missing; verify against the original file.
970 private final int initializePattern()
972 if (m_collator_.getStrength() == Collator.PRIMARY) {
973 m_pattern_.m_hasPrefixAccents_ = false;
974 m_pattern_.m_hasSuffixAccents_ = false;
// prefix accent: FCD leading combining class of the first code point
976 m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0)
977 >> SECOND_LAST_BYTE_SHIFT_) != 0;
// suffix accent: FCD trailing combining class of the last code point
978 m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText.codePointBefore(
979 m_pattern_.targetText.length()))
980 & LAST_BYTE_MASK_) != 0;
982 // since initializePattern is an internal method status is a success.
983 return initializePatternCETable();
\r
987 * Initializing shift tables, with the default values.
\r
988 * If a corresponding default value is 0, the shift table is not set.
\r
989 * @param shift table for forwards shift
\r
990 * @param backshift table for backwards shift
\r
991 * @param cetable table containing pattern ce
\r
992 * @param cesize size of the pattern ces
\r
993 * @param expansionsize total size of the expansions
\r
994 * @param defaultforward the default forward value
\r
995 * @param defaultbackward the default backward value
\r
// Fills the Boyer-Moore forward (shift) and backward (backshift) tables
// from the pattern CE table, per the javadoc above.
// NOTE(review): extraction-damaged listing — original lines 998 (the
// `backshift` parameter), 1003, 1010, 1016, 1020, 1023, 1028 and the
// closing brace are missing; verify against the original file.
997 private final void setShiftTable(char shift[],
999 int cetable[], int cesize,
1000 int expansionsize,
1001 char defaultforward,
1002 char defaultbackward)
1004 // estimate the value to shift. to do that we estimate the smallest
1005 // number of characters to give the relevant ces, ie approximately
1006 // the number of ces minus their expansion, since expansions can come
1007 // from a character.
1008 for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
1009 shift[count] = defaultforward;
1011 cesize --; // down to the last index
1012 for (int count = 0; count < cesize; count ++) {
1013 // number of ces from right of array to the count
1014 int temp = defaultforward - count - 1;
1015 shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1;
// last CE of the pattern always shifts by one
1017 shift[hash(cetable[cesize])] = 1;
1018 // for ignorables we just shift by one. see test examples.
1019 shift[hash(0)] = 1;
1021 for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
1022 backshift[count] = defaultbackward;
1024 for (int count = cesize; count > 0; count --) {
1025 // the original value count does not seem to work
1026 backshift[hash(cetable[count])] = (char)(count > expansionsize ?
1027 count - expansionsize : 1);
1029 backshift[hash(cetable[0])] = 1;
1030 backshift[hash(0)] = 1;
\r
1034 * <p>Building of the pattern collation element list and the Boyer Moore
\r
1035 * StringSearch table.</p>
\r
1036 * <p>The canonical match will only be performed after the default match
\r
1038 * <p>For both cases we need to remember the size of the composed and
\r
1039 * decomposed versions of the string. Since the Boyer-Moore shift
\r
1040 * calculations shifts by a number of characters in the text and tries to
\r
1041 * match the pattern from that offset, the shift value can not be too large
\r
1042 * in case we miss some characters. To choose a right shift size, we
\r
1043 * estimate the NFC form of the pattern and use its size as a shift guide. The NFC

1044 * form should be the smallest possible representation of the pattern. Anyways,
\r
1045 * we'll err on the smaller shift size. Hence the calculation for
\r
1046 * minlength. Canonical match will be performed slightly differently. We'll
\r
1047 * split the pattern into 3 parts, the prefix accents (PA), the middle
\r
1048 * string bounded by the first and last base character (MS), the ending
\r
1049 * accents (EA). Matches will be done on MS first, and only when we match
\r
1050 * MS then some processing will be required for the prefix and end accents
\r
1051 * in order to determine if they match PA and EA. Hence the default shift
\r
1052 * values for the canonical match will take the size of either end's accent
\r
1053 * into consideration. Forwards search will take the end accents into
\r
1054 * consideration for the default shift values and the backwards search will
\r
1055 * take the prefix accents into consideration.</p>
\r
1056 * <p>If pattern has no non-ignorable ce, we return an illegal argument
\r
// Builds the pattern CE list and the Boyer-Moore shift tables (see javadoc
// above). The default shift is the CE count minus the expansion count,
// clamped to at least 1; a pattern with no non-ignorable CEs gets a
// default shift size of 0.
// NOTE(review): extraction-damaged listing — original lines 1060, 1069-1070
// (brace/else) and the method close are missing; verify against the
// original file.
1059 private final void initialize()
1061 int expandlength = initializePattern();
1062 if (m_pattern_.m_CELength_ > 0) {
1063 char minlength = (char)(m_pattern_.m_CELength_ > expandlength
1064 ? m_pattern_.m_CELength_ - expandlength : 1);
1065 m_pattern_.m_defaultShiftSize_ = minlength;
1066 setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_,
1067 m_pattern_.m_CE_, m_pattern_.m_CELength_,
1068 expandlength, minlength, minlength);
1071 m_pattern_.m_defaultShiftSize_ = 0;
\r
1076 * Determine whether the search text bounded by the offset start and end is
\r
1077 * one or more whole units of text as determined by the breakiterator in
\r
1079 * @param start target text start offset
\r
1080 * @param end target text end offset
\r
// Tests whether [start, end) is bounded by break-iterator boundaries, and
// additionally (in the visible tail) re-walks the CEs of the candidate span
// to compare against the pattern CE table.
// NOTE(review): extraction-damaged listing — original lines 1083, 1087,
// 1091-1092, 1100, 1105, 1108-1110, 1112-1114, 1119 and 1123+ are missing
// (braces, returns, loop bodies); verify against the original file.
1082 private final boolean isBreakUnit(int start, int end)
1084 if (breakIterator != null) {
1085 int startindex = breakIterator.first();
1086 int endindex = breakIterator.last();
1088 // out-of-range indexes are never boundary positions
1089 if (start < startindex || start > endindex || end < startindex
1090 || end > endindex) {
1093 // otherwise, we can use following() on the position before the
1094 // specified one and return true if the position we get back is the
1095 // one the user specified
1096 boolean result = (start == startindex
1097 || breakIterator.following(start - 1) == start)
1098 && (end == endindex
1099 || breakIterator.following(end - 1) == end);
1101 // iterates the individual ces
1102 m_utilColEIter_.setText(
1103 new CharacterIteratorWrapper(targetText), start);
1104 for (int count = 0; count < m_pattern_.m_CELength_;
1106 int ce = getCE(m_utilColEIter_.next());
1107 if (ce == CollationElementIterator.IGNORABLE) {
1111 if (ce != m_pattern_.m_CE_[count]) {
// skip trailing ignorables that still fall within the match end
1115 int nextce = m_utilColEIter_.next();
1116 while (m_utilColEIter_.getOffset() == end
1117 && getCE(nextce) == CollationElementIterator.IGNORABLE) {
1118 nextce = m_utilColEIter_.next();
1120 if (nextce != CollationElementIterator.NULLORDER
1121 && m_utilColEIter_.getOffset() == end) {
1122 // extra collation elements at the end of the match
\r
1132 * Getting the next base character offset if current offset is an accent,
\r
1133 * or the current offset if the current character contains a base character.
\r
1134 * accents the following base character will be returned
\r
1135 * @param text string
\r
1136 * @param textoffset current offset
\r
1137 * @param textlength length of text string
\r
1138 * @return the next base character or the current offset
\r
1139 * if the current character contains a base character.
\r
// Returns the offset of the next base character (per FCD leading combining
// class) at or after textoffset, or textoffset itself when it already sits
// on a base character or at the end of the text.
// NOTE(review): extraction-damaged listing — original lines 1142, 1145,
// 1148, 1151, 1157 and the loop/return tail past 1158 are missing; verify
// against the original file.
1141 private final int getNextBaseOffset(CharacterIterator text, int textoffset)
1143 if (textoffset >= text.getEndIndex()) {
1144 return textoffset;
1146 // iteration ends with reading CharacterIterator.DONE which has fcd==0
1147 char c = text.setIndex(textoffset);
// single-lead fast path: leading combining class 0 means base character
1149 if ((m_nfcImpl_.getFCD16FromSingleLead(c) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1150 return textoffset;
1152 char next = text.next();
1153 if (Character.isSurrogatePair(c, next)) {
1154 int fcd = m_nfcImpl_.getFCD16(Character.toCodePoint(c, next));
1155 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1156 return textoffset;
1158 next = text.next();
\r
1168 * Gets the next base character offset depending on the string search
\r
1170 * @param textoffset one offset away from the last character
\r
1172 * @return start index of the next base character or the current offset
\r
1173 * if the current character is contains a base character.
\r
// Convenience overload over the target text: only scans forward for a base
// character when the pattern has suffix accents and the preceding character
// has a non-zero FCD trailing combining class.
// NOTE(review): extraction-damaged listing — original lines 1176, 1180-1181
// and the method close are missing; verify against the original file.
1175 private final int getNextBaseOffset(int textoffset)
1177 if (m_pattern_.m_hasSuffixAccents_ && textoffset < m_textLimitOffset_) {
1178 if ((getFCDBefore(targetText, textoffset) & LAST_BYTE_MASK_) != 0) {
1179 return getNextBaseOffset(targetText, textoffset);
1182 return textoffset;
\r
1186 * Shifting the collation element iterator position forward to prepare for
\r
1187 * a following match. If the last character is a unsafe character, we'll
\r
1188 * only shift by 1 to capture contractions, normalization etc.
\r
1189 * Internal method, status assumed to be success.
\r
1190 * @param textoffset start text position to do search
\r
1191 * @param ce the text ce which failed the match.
\r
1192 * @param patternceindex index of the ce within the pattern ce buffer which
\r
1193 * failed the match
\r
1194 * @return final offset
\r
// Advances textoffset for the next forward match attempt: uses the
// Boyer-Moore shift table entry for the failing CE (reduced when the
// failure happened mid-pattern), or the default shift when the text ran
// out (NULLORDER), then normalizes to the next base character.
// NOTE(review): extraction-damaged listing — original lines 1197-1198,
// 1206, 1208-1209, 1211-1212 and the close are missing; verify against the
// original file.
1196 private int shiftForward(int textoffset, int ce, int patternceindex)
1199 if (ce != CollationElementIterator.NULLORDER) {
1200 int shift = m_pattern_.m_shift_[hash(ce)];
1201 // this is to adjust for characters in the middle of the
1202 // substring for matching that failed.
1203 int adjust = m_pattern_.m_CELength_ - patternceindex;
1204 if (adjust > 1 && shift >= adjust) {
1205 shift -= adjust - 1;
1207 textoffset += shift;
1210 textoffset += m_pattern_.m_defaultShiftSize_;
1213 textoffset = getNextBaseOffset(textoffset);
1214 // check for unsafe characters
1215 // * if it is the start or middle of a contraction: to be done after
1216 // an initial match is found
1217 // * thai or lao base consonant character: similar to contraction
1218 // * high surrogate character: similar to contraction
1219 // * next character is an accent: shift to the next base character
1220 return textoffset;
\r
1224 * Gets the offset to the next safe point in text.
\r
1225 * ie. not the middle of a contraction, swappable characters or
\r
1226 * supplementary characters.
\r
1227 * @param textoffset offset in string
\r
1228 * @param end offset in string
\r
1229 * @return offset to the next safe character
\r
// Walks forward from textoffset while the collator marks the current
// character unsafe (contraction/swappable/supplementary), stopping at
// `end`; returns the first safe offset.
// NOTE(review): extraction-damaged listing — original lines 1232, 1237 (the
// result increment) and 1239+ (return/close) are missing; verify against
// the original file.
1231 private final int getNextSafeOffset(int textoffset, int end)
1233 int result = textoffset; // first contraction character
1234 targetText.setIndex(result);
1235 while (result != end &&
1236 m_collator_.isUnsafe(targetText.current())) {
1238 targetText.setIndex(result);
\r
1244 * This checks for accents in the potential match started with a composite
\r
1246 * This is really painful... we have to check that composite character do
\r
1247 * not have any extra accents. We have to normalize the potential match and
\r
1248 * find the immediate decomposed character before the match.
\r
1249 * The first composite character would have been taken care of by the fcd
\r
1250 * checks in checkForwardExactMatch.
\r
1251 * This is the slow path after the fcd of the first character and
\r
1252 * the last character has been checked by checkForwardExactMatch and we
\r
1253 * determine that the potential match has extra non-ignorable preceding
\r
1255 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
\r
1256 * checkExtraMatchAccent should fail since there is a middle ring in
\r
1257 * \u01FA Note here that accents checking are slow and cautioned in the API
\r
1259 * Internal method, status assumed to be a success, caller should check
\r
1260 * status before calling this method
\r
1261 * @param start index of the potential unfriendly composite character
\r
1262 * @param end index of the potential unfriendly composite character
\r
1263 * @return true if there is non-ignorable accents before at the beginning
\r
1264 * of the match, false otherwise.
\r
// Slow-path check (see javadoc above) that a potential match starting in a
// composite character has no extra non-ignorable accents before the first
// pattern CE: decomposes the leading unsafe span and inspects the CE and
// combining class immediately before the first matching CE.
// NOTE(review): extraction-damaged listing — original lines 1267, 1271,
// 1275-1276, 1283-1284, 1291, 1297, 1299 and 1305+ (braces, `offset`
// declaration, return) are missing; verify against the original file.
1266 private final boolean checkExtraMatchAccents(int start, int end)
1268 boolean result = false;
1269 if (m_pattern_.m_hasPrefixAccents_) {
1270 targetText.setIndex(start);
// back up if we landed between a surrogate pair
1272 if (UTF16.isLeadSurrogate(targetText.next())) {
1273 if (!UTF16.isTrailSurrogate(targetText.next())) {
1274 targetText.previous();
1277 // we are only concerned with the first composite character
1278 String str = getString(targetText, start, end);
1279 if (Normalizer.quickCheck(str, Normalizer.NFD,0)
1280 == Normalizer.NO) {
1281 int safeoffset = getNextSafeOffset(start, end);
1282 if (safeoffset != end) {
1285 String decomp = Normalizer.decompose(
1286 str.substring(0, safeoffset - start), false);
1287 m_utilColEIter_.setText(decomp);
1288 int firstce = m_pattern_.m_CE_[0];
1289 boolean ignorable = true;
1290 int ce = CollationElementIterator.IGNORABLE;
// scan until the first pattern CE, noting any non-ignorable CE seen
1292 while (ce != firstce) {
1293 offset = m_utilColEIter_.getOffset();
1294 if (ce != firstce
1295 && ce != CollationElementIterator.IGNORABLE) {
1296 ignorable = false;
1298 ce = m_utilColEIter_.next();
1300 m_utilColEIter_.setExactOffset(offset); // back up 1 to the
1301 m_utilColEIter_.previous(); // right offset
1302 offset = m_utilColEIter_.getOffset();
// non-ignorable CE with a combining mark right before the match => fail
1303 result = !ignorable && (UCharacter.getCombiningClass(
1304 UTF16.charAt(decomp, offset)) != 0);
\r
1312 * Used by exact matches, checks if there are accents before the match.
\r
1313 * This is really painful... we have to check that composite characters at
\r
1314 * the start of the matches have to not have any extra accents.
\r
1315 * We check the FCD of the character first, if it starts with an accent and
\r
1316 * the first pattern ce does not match the first ce of the character, we
\r
1318 * Otherwise we try normalizing the first composite
\r
1319 * character and find the immediate decomposed character before the match to
\r
1320 * see if it is an non-ignorable accent.
\r
1321 * Now normalizing the first composite character is enough because we ensure
\r
1322 * that when the match is passed in here with extra beginning ces, the
\r
1323 * first or last ce that match has to occur within the first character.
\r
1324 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
\r
1325 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
\r
1326 * Note here that accents checking are slow and cautioned in the API docs.
\r
1327 * @param start offset
\r
1328 * @param end offset
\r
1329 * @return true if there are accents on either side of the match,
\r
// Exact-match guard (see javadoc above): detects non-ignorable accents
// preceding the match, both inside a leading composite character (via the
// normalization buffer / checkExtraMatchAccents) and immediately before
// `start` in the surrounding text.
// NOTE(review): extraction-damaged listing — original lines 1333, 1343,
// 1345, 1348-1351, 1353-1354, 1356-1359, 1364 and 1369+ are missing
// (braces, returns, the fcd comparison tail); verify against the original
// file.
1332 private final boolean hasAccentsBeforeMatch(int start, int end)
1334 if (m_pattern_.m_hasPrefixAccents_) {
1335 // we have been iterating forwards previously
1336 boolean ignorable = true;
1337 int firstce = m_pattern_.m_CE_[0];
1338 m_colEIter_.setExactOffset(start);
1339 int ce = getCE(m_colEIter_.next());
1340 while (ce != firstce) {
1341 if (ce != CollationElementIterator.IGNORABLE) {
1342 ignorable = false;
1344 ce = getCE(m_colEIter_.next());
1346 if (!ignorable && m_colEIter_.isInBuffer()) {
1347 // within normalization buffer, discontiguous handled here
// leading-accent FCD check on the first character of the match
1352 boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_)
1355 return checkExtraMatchAccents(start, end);
// also inspect the character immediately before the match start
1360 if (start > m_textBeginOffset_) {
1361 targetText.setIndex(start);
1362 targetText.previous();
1363 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_)
1365 m_colEIter_.setExactOffset(start);
1366 ce = m_colEIter_.previous();
1367 if (ce != CollationElementIterator.NULLORDER
1368 && ce != CollationElementIterator.IGNORABLE) {
\r
1379 * Used by exact matches, checks if there are accents bounding the match.
\r
1380 * Note this is the initial boundary check. If the potential match
\r
1381 * starts or ends with composite characters, the accents in those
\r
1382 * characters will be determined later.
\r
1383 * Not doing backwards iteration here, since discontiguous contraction for
\r
1384 * backwards collation element iterator, use up too many characters.
\r
1385 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
\r
1386 * should fail since there is a acute at the end of \u01FA
\r
1387 * Note here that accents checking are slow and cautioned in the API docs.
\r
1388 * @param start offset of match
\r
1389 * @param end end offset of the match
\r
1390 * @return true if there are accents on either side of the match,
\r
// Exact-match guard (see javadoc above): detects non-ignorable accents
// trailing the match by re-walking the match CEs forward and checking for
// leftover non-ignorable CEs within `end`, plus an FCD check at `end`.
// NOTE(review): extraction-damaged listing — original lines 1394, 1402-1403,
// 1408-1409, 1413-1416, 1421-1422, 1426-1427 and 1429+ are missing (braces,
// `count` declaration/increment, returns); verify against the original
// file.
1393 private final boolean hasAccentsAfterMatch(int start, int end)
1395 if (m_pattern_.m_hasSuffixAccents_) {
1396 targetText.setIndex(end);
// step back over a surrogate pair boundary at the match end
1397 if (end > m_textBeginOffset_
1398 && UTF16.isTrailSurrogate(targetText.previous())) {
1399 if (targetText.getIndex() > m_textBeginOffset_ &&
1400 !UTF16.isLeadSurrogate(targetText.previous())) {
1401 targetText.next();
1404 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1405 int firstce = m_pattern_.m_CE_[0];
1406 m_colEIter_.setExactOffset(start);
// skip to the first CE of the match, then consume the pattern's CEs
1407 while (getCE(m_colEIter_.next()) != firstce) {
1410 while (count < m_pattern_.m_CELength_) {
1411 if (getCE(m_colEIter_.next())
1412 == CollationElementIterator.IGNORABLE) {
1417 //int ce = getCE(m_colEIter_.next());
1418 int ce = m_colEIter_.next();
1419 if (ce != CollationElementIterator.NULLORDER
1420 && ce != CollationElementIterator.IGNORABLE) {
1423 if (ce != CollationElementIterator.NULLORDER
1424 && ce != CollationElementIterator.IGNORABLE) {
1425 if (m_colEIter_.getOffset() <= end) {
1428 if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_)
\r
1439 * Checks if the offset runs out of the text string range
\r
1440 * @param textstart offset of the first character in the range
\r
1441 * @param textlimit limit offset of the text string range
\r
1442 * @param offset to test
\r
1443 * @return true if offset is out of bounds, false otherwise
\r
// True when `offset` lies outside the inclusive range
// [textstart, textlimit].
// NOTE(review): extraction-damaged listing — original lines 1446-1447 (the
// `offset` parameter and opening brace) and the close are missing; verify
// against the original file.
1445 private static final boolean isOutOfBounds(int textstart, int textlimit,
1448 return offset < textstart || offset > textlimit;
\r
1452 * Checks for identical match
\r
1453 * @param strsrch string search data
\r
1454 * @param start offset of possible match
\r
1455 * @param end offset of possible match
\r
1456 * @return true if identical match is found
\r
// At IDENTICAL strength only: compares the NFD forms of the matched text
// span and the pattern for exact string equality; trivially passes at any
// other strength.
// NOTE(review): extraction-damaged listing — original lines 1459, 1461-1463,
// 1468, 1473 and the close are missing (braces, the early `return true`);
// verify against the original file.
1458 private final boolean checkIdentical(int start, int end)
1460 if (m_collator_.getStrength() != Collator.IDENTICAL) {
1464 String textstr = getString(targetText, start, end - start);
1465 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
1466 == Normalizer.NO) {
1467 textstr = Normalizer.decompose(textstr, false);
1469 String patternstr = m_pattern_.targetText;
1470 if (Normalizer.quickCheck(patternstr, Normalizer.NFD,0)
1471 == Normalizer.NO) {
1472 patternstr = Normalizer.decompose(patternstr, false);
1474 return textstr.equals(patternstr);
\r
1478 * Checks to see if the match is repeated
\r
1479 * @param start new match start index
\r
1480 * @param limit new match limit index
\r
1481 * @return true if the the match is repeated, false otherwise
\r
// True when [start, limit) repeats or overlaps the previous match: any
// overlap counts when overlapping mode is off; only full containment of
// the previous match counts when it is on.
// NOTE(review): extraction-damaged listing — original lines 1484, 1486-1487,
// 1494-1495 and the close are missing; verify against the original file.
1483 private final boolean checkRepeatedMatch(int start, int limit)
1485 if (m_matchedIndex_ == DONE) {
1488 int end = limit - 1; // last character in the match
1489 int lastmatchend = m_matchedIndex_ + matchLength - 1;
1490 if (!isOverlapping()) {
1491 return (start >= m_matchedIndex_ && start <= lastmatchend)
1492 || (end >= m_matchedIndex_ && end <= lastmatchend)
1493 || (start <= m_matchedIndex_ && end >= lastmatchend);
1496 return start <= m_matchedIndex_ && end >= lastmatchend;
\r
1500 * Checks match for contraction.
\r
1501 * If the match ends with a partial contraction we fail.
\r
1502 * If the match starts too far off (because of backwards iteration) we try
\r
1503 * to chip off the extra characters depending on whether a breakiterator
\r
1505 * Temporary utility buffer used to return modified start and end.
\r
1506 * @param start offset of potential match, to be modified if necessary
\r
1507 * @param end offset of potential match, to be modified if necessary
\r
1508 * @return true if match passes the contraction test, false otherwise.
\r
// Contraction check for a forward exact match (see javadoc above): when a
// character adjacent to either match boundary is collator-unsafe, replays
// the CEs from `start` (skipping buffered CEs left by setOffset) and
// re-verifies them against the pattern; writes possibly-adjusted start/end
// into m_utilBuffer_[0]/[1].
// NOTE(review): extraction-damaged listing — original lines 1511, 1514,
// 1518, 1523, 1530, 1541, 1543, 1545-1547, 1551-1552, 1555, 1557, 1559,
// 1563-1567 and 1570+ are missing (`endchar`/`temp`/`count` declarations,
// braces, returns); verify against the original file.
1510 private final boolean checkNextExactContractionMatch(int start, int end)
1512 // This part checks if either ends of the match contains potential
1513 // contraction. If so we'll have to iterate through them
1515 if (end < m_textLimitOffset_) {
1516 targetText.setIndex(end);
1517 endchar = targetText.current();
1519 char poststartchar = 0;
1520 if (start + 1 < m_textLimitOffset_) {
1521 targetText.setIndex(start + 1);
1522 poststartchar = targetText.current();
1524 if (m_collator_.isUnsafe(endchar)
1525 || m_collator_.isUnsafe(poststartchar)) {
1526 // expansion prefix, what's left to iterate
1527 int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
1528 boolean hasBufferedCE = bufferedCEOffset > 0;
1529 m_colEIter_.setExactOffset(start);
1531 while (bufferedCEOffset > 0) {
1532 // getting rid of the redundant ce, caused by setOffset.
1533 // since backward contraction/expansion may have extra ces if
1534 // we are in the normalization buffer, hasAccentsBeforeMatch
1535 // would have taken care of it.
1536 // E.g. the character \u01FA will have an expansion of 3, but
1537 // if we are only looking for acute and ring \u030A and \u0301,
1538 // we'll have to skip the first ce in the expansion buffer.
1539 m_colEIter_.next();
1540 if (m_colEIter_.getOffset() != temp) {
1542 temp = m_colEIter_.getOffset();
1544 bufferedCEOffset --;
// re-verify the candidate's CEs against the pattern CE table
1548 while (count < m_pattern_.m_CELength_) {
1549 int ce = getCE(m_colEIter_.next());
1550 if (ce == CollationElementIterator.IGNORABLE) {
1553 if (hasBufferedCE && count == 0
1554 && m_colEIter_.getOffset() != temp) {
1556 temp = m_colEIter_.getOffset();
1558 if (ce != m_pattern_.m_CE_[count]) {
1560 end = getNextBaseOffset(end);
1561 m_utilBuffer_[0] = start;
1562 m_utilBuffer_[1] = end;
// fast path: no unsafe boundary characters, pass the range through
1568 m_utilBuffer_[0] = start;
1569 m_utilBuffer_[1] = end;
\r
1575 * Checks and sets the match information if found.
\r
1578 * <li> the potential match does not repeat the previous match
\r
1579 * <li> boundaries are correct
\r
1580 * <li> exact matches has no extra accents
\r
1581 * <li> identical matches
\r
1582 * <li> potential match does not end in the middle of a contraction
\r
1584 * Otherwise the offset will be shifted to the next character.
\r
1585 * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
\r
1586 * more fitting result value.
\r
1587 * Uses the temporary utility buffer for storing the modified textoffset.
\r
1588 * @param textoffset offset in the collation element text.
\r
1589 * @return true if the match is valid, false otherwise
\r
// Validates a candidate forward exact match (see javadoc above): runs the
// contraction check, then rejects on repeated match, bad break boundaries,
// surrounding accents, or failed identical check; on success records
// m_matchedIndex_ and matchLength. Adjusted offsets travel through
// m_utilBuffer_.
// NOTE(review): extraction-damaged listing — original lines 1592, 1597-1599,
// 1608, 1611-1613, 1616-1617 and 1621+ are missing (braces, `return false`
// / `return true` statements); verify against the original file.
1591 private final boolean checkNextExactMatch(int textoffset)
1593 int start = m_colEIter_.getOffset();
1594 if (!checkNextExactContractionMatch(start, textoffset)) {
1595 // returns the modified textoffset
1596 m_utilBuffer_[0] = m_utilBuffer_[1];
1600 start = m_utilBuffer_[0];
1601 textoffset = m_utilBuffer_[1];
1602 // this totally matches, however we need to check if it is repeating
1603 if (!isBreakUnit(start, textoffset)
1604 || checkRepeatedMatch(start, textoffset)
1605 || hasAccentsBeforeMatch(start, textoffset)
1606 || !checkIdentical(start, textoffset)
1607 || hasAccentsAfterMatch(start, textoffset)) {
1609 textoffset = getNextBaseOffset(textoffset);
1610 m_utilBuffer_[0] = textoffset;
1614 if (m_collator_.getStrength() == Collator.PRIMARY) {
1615 textoffset = checkBreakBoundary(textoffset);
1618 // totally match, we will get rid of the ending ignorables.
1619 m_matchedIndex_ = start;
1620 matchLength = textoffset - start;
\r
1625 * Getting the previous base character offset, or the current offset if the
\r
1626 * current character is a base character
\r
1627 * @param text the source text to work on
\r
1628 * @param textoffset one offset after the current character
\r
1629 * @return the offset of the next character after the base character or the
\r
1630 * first composed character with accents
\r
// Walks backwards from textoffset to the previous base character
// (FCD leading combining class 0), handling surrogate pairs; returns the
// current offset unchanged when it already follows a base character.
// NOTE(review): extraction-damaged listing — original lines 1633-1634,
// 1636, 1642-1644, 1650-1652, 1655-1657 and the close are missing (the
// `textoffset` parameter line, braces, loop structure); verify against the
// original file.
1632 private final int getPreviousBaseOffset(CharacterIterator text,
1635 if (textoffset > m_textBeginOffset_) {
1637 int result = textoffset;
1638 text.setIndex(result);
// step back over a full surrogate pair if present
1639 if (UTF16.isTrailSurrogate(text.previous())) {
1640 if (text.getIndex() != text.getBeginIndex() &&
1641 !UTF16.isLeadSurrogate(text.previous())) {
1645 textoffset = text.getIndex();
1646 char fcd = getFCD(text, textoffset);
1647 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1648 if ((fcd & LAST_BYTE_MASK_) != 0) {
1649 return textoffset;
1653 if (textoffset == m_textBeginOffset_) {
1654 return m_textBeginOffset_;
1658 return textoffset;
\r
1662 * Getting the indexes of the accents that are not blocked in the argument
\r
1664 * @param accents accents in nfd.
\r
1665 * @param accentsindex array to store the indexes of accents in accents that
\r
1667 * @return the length of populated accentsindex
\r
// Records into accentsindex the start index of each run of accents sharing
// a combining class (unblocked groups) within the NFD accent buffer;
// terminates the index list with the buffer length. Returns the populated
// count per the javadoc above.
// NOTE(review): extraction-damaged listing — original lines 1671-1672,
// 1674-1675, 1682-1683, 1685-1690 and 1692+ are missing (`index`/`cclass`/
// `result` declarations, increments, return); verify against the original
// file.
1669 private int getUnblockedAccentIndex(StringBuilder accents,
1670 int accentsindex[])
1673 int length = accents.length();
1676 while (index < length) {
1677 int codepoint = UTF16.charAt(accents, index);
1678 int tempclass = UCharacter.getCombiningClass(codepoint);
// a new combining class starts a new unblocked accent group
1679 if (tempclass != cclass) {
1680 cclass = tempclass;
1681 accentsindex[result] = index;
1684 if (UCharacter.isSupplementary(codepoint)) {
// sentinel entry: end of the accent buffer
1691 accentsindex[result] = length;
\r
1696 * Appends 3 StringBuilder/CharacterIterator together into a destination
\r
1698 * @param source1 string buffer
\r
1699 * @param source2 character iterator
\r
1700 * @param start2 start of the character iterator to merge
\r
1701 * @param end2 end of the character iterator to merge
\r
1702 * @param source3 string buffer
\r
1703 * @return appended string buffer
\r
// Concatenates source1, the [start2, end2) slice of the character
// iterator, and source3 into a new StringBuilder; null/empty buffers are
// skipped.
// NOTE(review): extraction-damaged listing — original lines 1709, 1713,
// 1717-1718 (presumably the iterator advance and loop close) and 1721+
// (return) are missing — as shown the while loop has no visible
// source2.next(); verify against the original file.
1705 private static final StringBuilder merge(StringBuilder source1,
1706 CharacterIterator source2,
1707 int start2, int end2,
1708 StringBuilder source3)
1710 StringBuilder result = new StringBuilder();
1711 if (source1 != null && source1.length() != 0) {
1712 result.append(source1);
1714 source2.setIndex(start2);
1715 while (source2.getIndex() < end2) {
1716 result.append(source2.current());
1719 if (source3 != null && source3.length() != 0) {
1720 result.append(source3);
\r
1726 * Running through a collation element iterator to see if the contents
\r
1727 * matches pattern in string search data
\r
1728 * @param coleiter collation element iterator to test
\r
1729 * @return true if a match if found, false otherwise
\r
// Runs the given collation element iterator against the pattern CE table,
// skipping ignorables; per the javadoc above, true when every pattern CE
// is matched.
// NOTE(review): extraction-damaged listing — original lines 1732, 1734,
// 1738-1739, 1741-1743 and 1745+ are missing (`offset` declaration and
// increment, returns); verify against the original file.
1731 private final boolean checkCollationMatch(CollationElementIterator coleiter)
1733 int patternceindex = m_pattern_.m_CELength_;
1735 while (patternceindex > 0) {
1736 int ce = getCE(coleiter.next());
1737 if (ce == CollationElementIterator.IGNORABLE) {
1740 if (ce != m_pattern_.m_CE_[offset]) {
1744 patternceindex --;
\r
1750 * Rearranges the front accents to try matching.
\r
1751 * Prefix accents in the text will be grouped according to their combining
\r
1752 * class and the groups will be mixed and matched to try find the perfect
\r
1753 * match with the pattern.
\r
1754 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
1755 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
1756 * substrings "\u030A", "\u0301", "\u0325", "\u030A\u0301",
\r
1757 * "\u030A\u0325", "\u0301\u0325".
\r
1758 * step 2: check if any of the generated substrings matches the pattern.
\r
1759 * Internal method, status is assumed to be success, caller has to check
\r
1760 * status before calling this method.
\r
1761 * @param start first offset of the accents to start searching
\r
1762 * @param end start of the last accent set
\r
1763 * @return DONE if a match is not found, otherwise return the starting
\r
1764 * offset of the match. Note this start includes all preceding
\r
// Canonical-match prefix rearrangement (see javadoc above): decomposes the
// leading accent run, enumerates every subset of unblocked accent groups
// via the `count` bitmask, merges each candidate prefix with the rest of
// the match, and tests it with checkCollationMatch.
// NOTE(review): extraction-damaged listing — original lines 1768,
// 1771-1773, 1777, 1784, 1786, 1794 (`int k = 0`?), 1797, 1804, 1806-1808,
// 1812 and 1817+ are missing (braces, returns, loop counters); verify
// against the original file.
1767 private int doNextCanonicalPrefixMatch(int start, int end)
1769 if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
1770 // die... failed at a base character
1774 start = targetText.getIndex(); // index changed by fcd
1775 int offset = getNextBaseOffset(targetText, start);
1776 start = getPreviousBaseOffset(start);
1778 StringBuilder accents = new StringBuilder();
1779 String accentstr = getString(targetText, start, offset - start);
1780 // normalizing the offensive string
1781 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
1782 == Normalizer.NO) {
1783 accentstr = Normalizer.decompose(accentstr, false);
1785 accents.append(accentstr);
1787 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
1788 int accentsize = getUnblockedAccentIndex(accents, accentsindex);
// bitmask over accent groups: (2 << (n-1)) - 1 == all groups selected
1789 int count = (2 << (accentsize - 1)) - 1;
1790 while (count > 0) {
1791 // copy the base characters
1792 m_canonicalPrefixAccents_.delete(0,
1793 m_canonicalPrefixAccents_.length());
1795 for (; k < accentsindex[0]; k ++) {
1796 m_canonicalPrefixAccents_.append(accents.charAt(k));
1798 // forming all possible canonical rearrangement by dropping
1799 // sets of accents
1800 for (int i = 0; i <= accentsize - 1; i ++) {
1801 int mask = 1 << (accentsize - i - 1);
1802 if ((count & mask) != 0) {
1803 for (int j = accentsindex[i]; j < accentsindex[i + 1];
1805 m_canonicalPrefixAccents_.append(accents.charAt(j));
1809 StringBuilder match = merge(m_canonicalPrefixAccents_,
1810 targetText, offset, end,
1811 m_canonicalSuffixAccents_);
1813 // if status is a failure, ucol_setText does nothing.
1814 // run the collator iterator through this match
1815 m_utilColEIter_.setText(match.toString());
1816 if (checkCollationMatch(m_utilColEIter_)) {
\r
1825 * Gets the offset to the safe point in text before textoffset.
\r
1826 * ie. not the middle of a contraction, swappable characters or
\r
1827 * supplementary characters.
\r
1828 * @param start offset in string
\r
1829 * @param textoffset offset in string
\r
1830 * @return offset to the previous safe character
\r
// Walks backwards from textoffset while characters are collator-unsafe,
// bounded below by `start`; returns the offset of the safe point (the
// first contraction character itself counts as unsafe).
// NOTE(review): extraction-damaged listing — original lines 1833, 1838 and
// 1842+ (braces, return) are missing; verify against the original file.
1832 private final int getPreviousSafeOffset(int start, int textoffset)
1834 int result = textoffset; // first contraction character
1835 targetText.setIndex(textoffset);
1836 while (result >= start && m_collator_.isUnsafe(targetText.previous())) {
1837 result = targetText.getIndex();
1839 if (result != start) {
1840 // the first contraction character is considered unsafe here
1841 result = targetText.getIndex(); // originally result --;
\r
1847 * Take the rearranged end accents and tries matching. If match failed at
\r
1848 * a separate preceding set of accents (separated from the rearranged ones by
\r
1849 * at least a base character) then we rearrange the preceding accents and
\r
1850 * tries matching again.
\r
1851 * We allow skipping of the ends of the accent set if the ces do not match.
\r
1852 * However if the failure is found before the accent set, it fails.
\r
1853 * Internal method, status assumed to be success, caller has to check
\r
1854 * status before calling this method.
\r
1855 * @param textoffset of the start of the rearranged accent
\r
1856 * @return DONE if a match is not found, otherwise return the starting
\r
1857 * offset of the match. Note this start includes all preceding
\r
// Canonical-match suffix pass (see javadoc above): builds a "safe text"
// from the unsafe tail of the target plus the rearranged suffix accents,
// walks its CEs backwards against the pattern, falls back to the main
// iterator when the safe buffer is exhausted, and on mismatch before the
// accent set retries via doNextCanonicalPrefixMatch.
// NOTE(review): extraction-damaged listing — original lines 1861, 1865,
// 1870, 1874-1875, 1877-1878, 1883, 1886, 1892-1893, 1897, 1900-1902,
// 1910-1913, 1915-1916, 1919, 1923-1926, 1928-1930, 1932, 1938-1939, 1941,
// 1944-1946 are missing (braces, DONE returns, isSafe bookkeeping); verify
// against the original file.
1860 private int doNextCanonicalSuffixMatch(int textoffset)
1862 int safelength = 0;
1863 StringBuilder safetext;
1864 int safeoffset = m_textBeginOffset_;
1866 if (textoffset != m_textBeginOffset_
1867 && m_canonicalSuffixAccents_.length() > 0
1868 && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) {
1869 safeoffset = getPreviousSafeOffset(m_textBeginOffset_,
1871 safelength = textoffset - safeoffset;
1872 safetext = merge(null, targetText, safeoffset, textoffset,
1873 m_canonicalSuffixAccents_);
1876 safetext = m_canonicalSuffixAccents_;
1879 // if status is a failure, ucol_setText does nothing
1880 CollationElementIterator coleiter = m_utilColEIter_;
1881 coleiter.setText(safetext.toString());
1882 // status checked in loop below
1884 int ceindex = m_pattern_.m_CELength_ - 1;
1885 boolean isSafe = true; // indication flag for position in safe zone
// match pattern CEs backwards, from the last CE to the first
1887 while (ceindex >= 0) {
1888 int textce = coleiter.previous();
1889 if (textce == CollationElementIterator.NULLORDER) {
1890 // check if we have passed the safe buffer
1891 if (coleiter == m_colEIter_) {
// safe buffer exhausted: switch to the main iterator at safeoffset
1894 coleiter = m_colEIter_;
1895 if (safetext != m_canonicalSuffixAccents_) {
1896 safetext.delete(0, safetext.length());
1898 coleiter.setExactOffset(safeoffset);
1899 // status checked at the start of the loop
1903 textce = getCE(textce);
1904 if (textce != CollationElementIterator.IGNORABLE
1905 && textce != m_pattern_.m_CE_[ceindex]) {
1906 // do the beginning stuff
1907 int failedoffset = coleiter.getOffset();
1908 if (isSafe && failedoffset >= safelength) {
1909 // alas... no hope. failed at rearranged accent set
// translate safe-buffer offset back into a target-text offset
1914 failedoffset += safeoffset;
1917 // try rearranging the front accents
1918 int result = doNextCanonicalPrefixMatch(failedoffset,
1920 if (result != DONE) {
1921 // if status is a failure, ucol_setOffset does nothing
1922 m_colEIter_.setExactOffset(result);
1927 if (textce == m_pattern_.m_CE_[ceindex]) {
1931 // set offset here
1933 int result = coleiter.getOffset();
1934 // sets the text iterator with the correct expansion and offset
1935 int leftoverces = coleiter.m_CEBufferOffset_;
1936 if (result >= safelength) {
1937 result = textoffset;
1940 result += safeoffset;
1942 m_colEIter_.setExactOffset(result);
1943 m_colEIter_.m_CEBufferOffset_ = leftoverces;
1947 return coleiter.getOffset();
\r
1951 * Tries out the substring and sees if it can be a canonical match.
\r
1952 * This will try normalizing the end accents and arranging them into
\r
1953 * canonical equivalents and check their corresponding ces with the pattern
\r
1955 * Suffix accents in the text will be grouped according to their combining
\r
1956 * class and the groups will be mixed and matched to try to find the perfect
\r
1957 * match with the pattern.
\r
1958 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
1959 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
1961 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
1963 * step 2: check if any of the generated substrings matches the pattern.
\r
1964 * @param textoffset end offset in the collation element text that ends with
\r
1965 * the accents to be rearranged
\r
1966 * @return true if the match is valid, false otherwise
\r
1968 private boolean doNextCanonicalMatch(int textoffset)
\r
1970 int offset = m_colEIter_.getOffset();
\r
1971 targetText.setIndex(textoffset);
\r
1972 if (UTF16.isTrailSurrogate(targetText.previous())
\r
1973 && targetText.getIndex() > m_textBeginOffset_) {
\r
1974 if (!UTF16.isLeadSurrogate(targetText.previous())) {
\r
1975 targetText.next();
\r
1978 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
\r
1979 if (m_pattern_.m_hasPrefixAccents_) {
\r
1980 offset = doNextCanonicalPrefixMatch(offset, textoffset);
\r
1981 if (offset != DONE) {
\r
1982 m_colEIter_.setExactOffset(offset);
\r
1989 if (!m_pattern_.m_hasSuffixAccents_) {
\r
1993 StringBuilder accents = new StringBuilder();
\r
1994 // offset to the last base character in substring to search
\r
1995 int baseoffset = getPreviousBaseOffset(targetText, textoffset);
\r
1996 // normalizing the offensive string
\r
1997 String accentstr = getString(targetText, baseoffset,
\r
1998 textoffset - baseoffset);
\r
1999 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
2000 == Normalizer.NO) {
\r
2001 accentstr = Normalizer.decompose(accentstr, false);
\r
2003 accents.append(accentstr);
\r
2004 // status checked in loop below
\r
2006 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2007 int size = getUnblockedAccentIndex(accents, accentsindex);
\r
2009 // 2 power n - 1 plus the full set of accents
\r
2010 int count = (2 << (size - 1)) - 1;
\r
2011 while (count > 0) {
\r
2012 m_canonicalSuffixAccents_.delete(0,
\r
2013 m_canonicalSuffixAccents_.length());
\r
2014 // copy the base characters
\r
2015 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2016 m_canonicalSuffixAccents_.append(accents.charAt(k));
\r
2018 // forming all possible canonical rearrangement by dropping
\r
2019 // sets of accents
\r
2020 for (int i = 0; i <= size - 1; i ++) {
\r
2021 int mask = 1 << (size - i - 1);
\r
2022 if ((count & mask) != 0) {
\r
2023 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2025 m_canonicalSuffixAccents_.append(accents.charAt(j));
\r
2029 offset = doNextCanonicalSuffixMatch(baseoffset);
\r
2030 if (offset != DONE) {
\r
2031 return true; // match found
\r
2039 * Gets the previous base character offset depending on the string search
\r
2041 * @param strsrch string search data
\r
2042 * @param textoffset current offset, current character
\r
2043 * @return the offset of the next character after this base character or
\r
2044 * itself if it is a composed character with accents
\r
2046 private final int getPreviousBaseOffset(int textoffset)
\r
2048 if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) {
\r
2049 int offset = textoffset;
\r
2050 if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
\r
2051 return getPreviousBaseOffset(targetText, textoffset);
\r
2054 return textoffset;
\r
2058 * Checks match for contraction.
\r
2059 * If the match ends with a partial contraction we fail.
\r
2060 * If the match starts too far off (because of backwards iteration) we try
\r
2061 * to chip off the extra characters.
\r
2062 * Uses the temporary util buffer for return values of the modified start
\r
2064 * @param start offset of potential match, to be modified if necessary
\r
2065 * @param end offset of potential match, to be modified if necessary
\r
2066 * @return true if match passes the contraction test, false otherwise.
\r
2068 private boolean checkNextCanonicalContractionMatch(int start, int end)
\r
2070 // This part checks if either ends of the match contains potential
\r
2071 // contraction. If so we'll have to iterate through them
\r
2074 if (end < m_textLimitOffset_) {
\r
2075 targetText.setIndex(end);
\r
2076 echar = targetText.current();
\r
2078 if (start < m_textLimitOffset_) {
\r
2079 targetText.setIndex(start + 1);
\r
2080 schar = targetText.current();
\r
2082 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2083 int expansion = m_colEIter_.m_CEBufferOffset_;
\r
2084 boolean hasExpansion = expansion > 0;
\r
2085 m_colEIter_.setExactOffset(start);
\r
2087 while (expansion > 0) {
\r
2088 // getting rid of the redundant ce, caused by setOffset.
\r
2089 // since backward contraction/expansion may have extra ces if
\r
2090 // we are in the normalization buffer, hasAccentsBeforeMatch
\r
2091 // would have taken care of it.
\r
2092 // E.g. the character \u01FA will have an expansion of 3, but
\r
2093 // if we are only looking for acute and ring \u030A and \u0301,
\r
2094 // we'll have to skip the first ce in the expansion buffer.
\r
2095 m_colEIter_.next();
\r
2096 if (m_colEIter_.getOffset() != temp) {
\r
2098 temp = m_colEIter_.getOffset();
\r
2104 while (count < m_pattern_.m_CELength_) {
\r
2105 int ce = getCE(m_colEIter_.next());
\r
2106 // status checked below, note that if status is a failure
\r
2107 // ucol_next returns UCOL_NULLORDER
\r
2108 if (ce == CollationElementIterator.IGNORABLE) {
\r
2111 if (hasExpansion && count == 0
\r
2112 && m_colEIter_.getOffset() != temp) {
\r
2114 temp = m_colEIter_.getOffset();
\r
2117 if (count == 0 && ce != m_pattern_.m_CE_[0]) {
\r
2118 // accents may have extra starting ces, this occurs when a
\r
2119 // pure accent pattern is matched without rearrangement
\r
2120 // text \u0325\u0300 and looking for \u0300
\r
2121 int expected = m_pattern_.m_CE_[0];
\r
2122 if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
\r
2123 ce = getCE(m_colEIter_.next());
\r
2124 while (ce != expected
\r
2125 && ce != CollationElementIterator.NULLORDER
\r
2126 && m_colEIter_.getOffset() <= end) {
\r
2127 ce = getCE(m_colEIter_.next());
\r
2131 if (ce != m_pattern_.m_CE_[count]) {
\r
2133 end = getNextBaseOffset(end);
\r
2134 m_utilBuffer_[0] = start;
\r
2135 m_utilBuffer_[1] = end;
\r
2141 m_utilBuffer_[0] = start;
\r
2142 m_utilBuffer_[1] = end;
\r
2147 * Checks and sets the match information if found.
\r
2150 * <li> the potential match does not repeat the previous match
\r
2151 * <li> boundaries are correct
\r
2152 * <li> potential match does not end in the middle of a contraction
\r
2153 * <li> identical matches
\r
2155 * Otherwise the offset will be shifted to the next character.
\r
2156 * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
\r
2157 * more fitting result value.
\r
2158 * Uses the temporary utility buffer for storing the modified textoffset.
\r
2159 * @param textoffset offset in the collation element text.
\r
2160 * @return true if the match is valid, false otherwise
\r
2162 private boolean checkNextCanonicalMatch(int textoffset)
\r
2164 // to ensure that the start and ends are not composite characters
\r
2165 // if we have a canonical accent match
\r
2166 if ((m_pattern_.m_hasSuffixAccents_
\r
2167 && m_canonicalSuffixAccents_.length() != 0) ||
\r
2168 (m_pattern_.m_hasPrefixAccents_
\r
2169 && m_canonicalPrefixAccents_.length() != 0)) {
\r
2170 m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset());
\r
2171 matchLength = textoffset - m_matchedIndex_;
\r
2175 int start = m_colEIter_.getOffset();
\r
2176 if (!checkNextCanonicalContractionMatch(start, textoffset)) {
\r
2177 // return the modified textoffset
\r
2178 m_utilBuffer_[0] = m_utilBuffer_[1];
\r
2181 start = m_utilBuffer_[0];
\r
2182 textoffset = m_utilBuffer_[1];
\r
2183 start = getPreviousBaseOffset(start);
\r
2184 // this totally matches, however we need to check if it is repeating
\r
2185 if (checkRepeatedMatch(start, textoffset)
\r
2186 || !isBreakUnit(start, textoffset)
\r
2187 || !checkIdentical(start, textoffset)) {
\r
2189 textoffset = getNextBaseOffset(targetText, textoffset);
\r
2190 m_utilBuffer_[0] = textoffset;
\r
2194 m_matchedIndex_ = start;
\r
2195 matchLength = textoffset - start;
\r
2200 * Shifting the collation element iterator position forward to prepare for
\r
2201 * a preceding match. If the first character is an unsafe character, we'll
\r
2202 * only shift by 1 to capture contractions, normalization etc.
\r
2203 * @param textoffset start text position to do search
\r
2204 * @param ce the text ce which failed the match.
\r
2205 * @param patternceindex index of the ce within the pattern ce buffer which
\r
2206 * failed the match
\r
2207 * @return final offset
\r
2209 private int reverseShift(int textoffset, int ce, int patternceindex)
\r
2211 if (isOverlapping()) {
\r
2212 if (textoffset != m_textLimitOffset_) {
\r
2216 textoffset -= m_pattern_.m_defaultShiftSize_;
\r
2220 if (ce != CollationElementIterator.NULLORDER) {
\r
2221 int shift = m_pattern_.m_backShift_[hash(ce)];
\r
2223 // this is to adjust for characters in the middle of the substring
\r
2224 // for matching that failed.
\r
2225 int adjust = patternceindex;
\r
2226 if (adjust > 1 && shift > adjust) {
\r
2227 shift -= adjust - 1;
\r
2229 textoffset -= shift;
\r
2232 textoffset -= m_pattern_.m_defaultShiftSize_;
\r
2236 textoffset = getPreviousBaseOffset(textoffset);
\r
2237 return textoffset;
\r
2241 * Checks match for contraction.
\r
2242 * If the match starts with a partial contraction we fail.
\r
2243 * Uses the temporary utility buffer to return the modified start and end.
\r
2244 * @param start offset of potential match, to be modified if necessary
\r
2245 * @param end offset of potential match, to be modified if necessary
\r
2246 * @return true if match passes the contraction test, false otherwise.
\r
2248 private boolean checkPreviousExactContractionMatch(int start, int end)
\r
2250 // This part checks if either ends of the match contains potential
\r
2251 // contraction. If so we'll have to iterate through them
\r
2253 if (end < m_textLimitOffset_) {
\r
2254 targetText.setIndex(end);
\r
2255 echar = targetText.current();
\r
2258 if (start + 1 < m_textLimitOffset_) {
\r
2259 targetText.setIndex(start + 1);
\r
2260 schar = targetText.current();
\r
2262 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2263 // expansion suffix, what's left to iterate
\r
2264 int expansion = m_colEIter_.m_CEBufferSize_
\r
2265 - m_colEIter_.m_CEBufferOffset_;
\r
2266 boolean hasExpansion = expansion > 0;
\r
2267 m_colEIter_.setExactOffset(end);
\r
2269 while (expansion > 0) {
\r
2270 // getting rid of the redundant ce
\r
2271 // since forward contraction/expansion may have extra ces
\r
2272 // if we are in the normalization buffer, hasAccentsBeforeMatch
\r
2273 // would have taken care of it.
\r
2274 // E.g. the character \u01FA will have an expansion of 3, but if
\r
2275 // we are only looking for A ring A\u030A, we'll have to skip the
\r
2276 // last ce in the expansion buffer
\r
2277 m_colEIter_.previous();
\r
2278 if (m_colEIter_.getOffset() != temp) {
\r
2280 temp = m_colEIter_.getOffset();
\r
2285 int count = m_pattern_.m_CELength_;
\r
2286 while (count > 0) {
\r
2287 int ce = getCE(m_colEIter_.previous());
\r
2288 // status checked below, note that if status is a failure
\r
2289 // ucol_previous returns UCOL_NULLORDER
\r
2290 if (ce == CollationElementIterator.IGNORABLE) {
\r
2293 if (hasExpansion && count == 0
\r
2294 && m_colEIter_.getOffset() != temp) {
\r
2296 temp = m_colEIter_.getOffset();
\r
2298 if (ce != m_pattern_.m_CE_[count - 1]) {
\r
2300 start = getPreviousBaseOffset(targetText, start);
\r
2301 m_utilBuffer_[0] = start;
\r
2302 m_utilBuffer_[1] = end;
\r
2308 m_utilBuffer_[0] = start;
\r
2309 m_utilBuffer_[1] = end;
\r
2314 * Checks and sets the match information if found.
\r
2317 * <li> the current match does not repeat the last match
\r
2318 * <li> boundaries are correct
\r
2319 * <li> exact matches have no extra accents
\r
2320 * <li> identical matches
\r
2322 * Otherwise the offset will be shifted to the preceding character.
\r
2323 * Uses the temporary utility buffer to store the modified textoffset.
\r
2324 * @param textoffset offset in the collation element text. the returned value
\r
2325 * will be the truncated start offset of the match or the new start
\r
2327 * @return true if the match is valid, false otherwise
\r
2329 private final boolean checkPreviousExactMatch(int textoffset)
\r
2331 // to ensure that the start and ends are not composite characters
\r
2332 int end = m_colEIter_.getOffset();
\r
2333 if (!checkPreviousExactContractionMatch(textoffset, end)) {
\r
2336 textoffset = m_utilBuffer_[0];
\r
2337 end = m_utilBuffer_[1];
\r
2339 // this totally matches, however we need to check if it is repeating
\r
2341 if (checkRepeatedMatch(textoffset, end)
\r
2342 || !isBreakUnit(textoffset, end)
\r
2343 || hasAccentsBeforeMatch(textoffset, end)
\r
2344 || !checkIdentical(textoffset, end)
\r
2345 || hasAccentsAfterMatch(textoffset, end)) {
\r
2347 textoffset = getPreviousBaseOffset(targetText, textoffset);
\r
2348 m_utilBuffer_[0] = textoffset;
\r
2352 if (m_collator_.getStrength() == Collator.PRIMARY) {
\r
2353 end = checkBreakBoundary(end);
\r
2356 m_matchedIndex_ = textoffset;
\r
2357 matchLength = end - textoffset;
\r
2362 * Rearranges the end accents to try matching.
\r
2363 * Suffix accents in the text will be grouped according to their combining
\r
2364 * class and the groups will be mixed and matched to try to find the perfect
\r
2365 * match with the pattern.
\r
2366 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
2367 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
2369 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
2371 * step 2: check if any of the generated substrings matches the pattern.
\r
2372 * @param start offset of the first base character
\r
2373 * @param end start of the last accent set
\r
2374 * @return DONE if a match is not found, otherwise return the ending
\r
2375 * offset of the match. Note this start includes all following
\r
2378 private int doPreviousCanonicalSuffixMatch(int start, int end)
\r
2380 targetText.setIndex(end);
\r
2381 if (UTF16.isTrailSurrogate(targetText.previous())
\r
2382 && targetText.getIndex() > m_textBeginOffset_) {
\r
2383 if (!UTF16.isLeadSurrogate(targetText.previous())) {
\r
2384 targetText.next();
\r
2387 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
\r
2388 // die... failed at a base character
\r
2391 end = getNextBaseOffset(targetText, end);
\r
2393 StringBuilder accents = new StringBuilder();
\r
2394 int offset = getPreviousBaseOffset(targetText, end);
\r
2395 // normalizing the offensive string
\r
2396 String accentstr = getString(targetText, offset, end - offset);
\r
2397 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
2398 == Normalizer.NO) {
\r
2399 accentstr = Normalizer.decompose(accentstr, false);
\r
2401 accents.append(accentstr);
\r
2403 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2404 int accentsize = getUnblockedAccentIndex(accents, accentsindex);
\r
2405 int count = (2 << (accentsize - 1)) - 1;
\r
2406 while (count > 0) {
\r
2407 m_canonicalSuffixAccents_.delete(0,
\r
2408 m_canonicalSuffixAccents_.length());
\r
2409 // copy the base characters
\r
2410 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2411 m_canonicalSuffixAccents_.append(accents.charAt(k));
\r
2413 // forming all possible canonical rearrangement by dropping
\r
2414 // sets of accents
\r
2415 for (int i = 0; i <= accentsize - 1; i ++) {
\r
2416 int mask = 1 << (accentsize - i - 1);
\r
2417 if ((count & mask) != 0) {
\r
2418 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2420 m_canonicalSuffixAccents_.append(accents.charAt(j));
\r
2424 StringBuilder match = merge(m_canonicalPrefixAccents_, targetText,
\r
2426 m_canonicalSuffixAccents_);
\r
2427 // run the collator iterator through this match
\r
2428 // if status is a failure ucol_setText does nothing
\r
2429 m_utilColEIter_.setText(match.toString());
\r
2430 if (checkCollationMatch(m_utilColEIter_)) {
\r
2439 * Takes the rearranged start accents and tries matching. If the match failed at
\r
2440 * a separate following set of accents (separated from the rearranged ones by
\r
2441 * at least a base character) then we rearrange the preceding accents and
\r
2442 * try matching again.
\r
2443 * We allow skipping of the ends of the accent set if the ces do not match.
\r
2444 * However if the failure is found before the accent set, it fails.
\r
2445 * Internal method, status assumed to be success, caller has to check
\r
2446 * status before calling this method.
\r
2447 * @param textoffset of the ends of the rearranged accent
\r
2448 * @return DONE if a match is not found, otherwise return the ending offset
\r
2449 * of the match. Note this start includes all following accents.
\r
2451 private int doPreviousCanonicalPrefixMatch(int textoffset)
\r
2453 // int safelength = 0;
\r
2454 StringBuilder safetext;
\r
2455 int safeoffset = textoffset;
\r
2457 if (textoffset > m_textBeginOffset_
\r
2458 && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt(
\r
2459 m_canonicalPrefixAccents_.length() - 1))) {
\r
2460 safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_);
\r
2461 //safelength = safeoffset - textoffset;
\r
2462 safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset,
\r
2463 safeoffset, null);
\r
2466 safetext = m_canonicalPrefixAccents_;
\r
2469 // if status is a failure, ucol_setText does nothing
\r
2470 CollationElementIterator coleiter = m_utilColEIter_;
\r
2471 coleiter.setText(safetext.toString());
\r
2472 // status checked in loop below
\r
2475 boolean isSafe = true; // safe zone indication flag for position
\r
2476 int prefixlength = m_canonicalPrefixAccents_.length();
\r
2478 while (ceindex < m_pattern_.m_CELength_) {
\r
2479 int textce = coleiter.next();
\r
2480 if (textce == CollationElementIterator.NULLORDER) {
\r
2481 // check if we have passed the safe buffer
\r
2482 if (coleiter == m_colEIter_) {
\r
2485 if (safetext != m_canonicalPrefixAccents_) {
\r
2486 safetext.delete(0, safetext.length());
\r
2488 coleiter = m_colEIter_;
\r
2489 coleiter.setExactOffset(safeoffset);
\r
2490 // status checked at the start of the loop
\r
2494 textce = getCE(textce);
\r
2495 if (textce != CollationElementIterator.IGNORABLE
\r
2496 && textce != m_pattern_.m_CE_[ceindex]) {
\r
2497 // do the beginning stuff
\r
2498 int failedoffset = coleiter.getOffset();
\r
2499 if (isSafe && failedoffset <= prefixlength) {
\r
2500 // alas... no hope. failed at rearranged accent set
\r
2505 failedoffset = safeoffset - failedoffset;
\r
2506 if (safetext != m_canonicalPrefixAccents_) {
\r
2507 safetext.delete(0, safetext.length());
\r
2511 // try rearranging the end accents
\r
2512 int result = doPreviousCanonicalSuffixMatch(textoffset,
\r
2514 if (result != DONE) {
\r
2515 // if status is a failure, ucol_setOffset does nothing
\r
2516 m_colEIter_.setExactOffset(result);
\r
2521 if (textce == m_pattern_.m_CE_[ceindex]) {
\r
2525 // set offset here
\r
2527 int result = coleiter.getOffset();
\r
2528 // sets the text iterator here with the correct expansion and offset
\r
2529 int leftoverces = coleiter.m_CEBufferSize_
\r
2530 - coleiter.m_CEBufferOffset_;
\r
2531 if (result <= prefixlength) {
\r
2532 result = textoffset;
\r
2535 result = textoffset + (safeoffset - result);
\r
2537 m_colEIter_.setExactOffset(result);
\r
2538 m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_
\r
2543 return coleiter.getOffset();
\r
2547 * Tries out the substring and sees if it can be a canonical match.
\r
2548 * This will try normalizing the starting accents and arranging them into
\r
2549 * canonical equivalents and check their corresponding ces with the pattern
\r
2551 * Prefix accents in the text will be grouped according to their combining
\r
2552 * class and the groups will be mixed and matched to try to find the perfect
\r
2553 * match with the pattern.
\r
2554 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
2555 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
2557 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
2559 * step 2: check if any of the generated substrings matches the pattern.
\r
2560 * @param textoffset start offset in the collation element text that starts
\r
2561 * with the accents to be rearranged
\r
2562 * @return true if the match is valid, false otherwise
\r
2564 private boolean doPreviousCanonicalMatch(int textoffset)
\r
2566 int offset = m_colEIter_.getOffset();
\r
2567 if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
\r
2568 if (m_pattern_.m_hasSuffixAccents_) {
\r
2569 offset = doPreviousCanonicalSuffixMatch(textoffset, offset);
\r
2570 if (offset != DONE) {
\r
2571 m_colEIter_.setExactOffset(offset);
\r
2578 if (!m_pattern_.m_hasPrefixAccents_) {
\r
2582 StringBuilder accents = new StringBuilder();
\r
2583 // offset to the last base character in substring to search
\r
2584 int baseoffset = getNextBaseOffset(targetText, textoffset);
\r
2585 // normalizing the offensive string
\r
2586 String textstr = getString(targetText, textoffset,
\r
2587 baseoffset - textoffset);
\r
2588 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
\r
2589 == Normalizer.NO) {
\r
2590 textstr = Normalizer.decompose(textstr, false);
\r
2592 accents.append(textstr);
\r
2593 // status checked in loop
\r
2595 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2596 int size = getUnblockedAccentIndex(accents, accentsindex);
\r
2598 // 2 power n - 1 plus the full set of accents
\r
2599 int count = (2 << (size - 1)) - 1;
\r
2600 while (count > 0) {
\r
2601 m_canonicalPrefixAccents_.delete(0,
\r
2602 m_canonicalPrefixAccents_.length());
\r
2603 // copy the base characters
\r
2604 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2605 m_canonicalPrefixAccents_.append(accents.charAt(k));
\r
2607 // forming all possible canonical rearrangement by dropping
\r
2608 // sets of accents
\r
2609 for (int i = 0; i <= size - 1; i ++) {
\r
2610 int mask = 1 << (size - i - 1);
\r
2611 if ((count & mask) != 0) {
\r
2612 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2614 m_canonicalPrefixAccents_.append(accents.charAt(j));
\r
2618 offset = doPreviousCanonicalPrefixMatch(baseoffset);
\r
2619 if (offset != DONE) {
\r
2620 return true; // match found
\r
2628 * Checks match for contraction.
\r
2629 * If the match starts with a partial contraction we fail.
\r
2630 * Uses the temporary utility buffer to return the modified start and end.
\r
2631 * @param start offset of potential match, to be modified if necessary
\r
2632 * @param end offset of potential match, to be modified if necessary
\r
2633 * @return true if match passes the contraction test, false otherwise.
\r
2635 private boolean checkPreviousCanonicalContractionMatch(int start, int end)
\r
2638 // This part checks if either ends of the match contains potential
\r
2639 // contraction. If so we'll have to iterate through them
\r
2642 if (end < m_textLimitOffset_) {
\r
2643 targetText.setIndex(end);
\r
2644 echar = targetText.current();
\r
2646 if (start + 1 < m_textLimitOffset_) {
\r
2647 targetText.setIndex(start + 1);
\r
2648 schar = targetText.current();
\r
2650 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2651 int expansion = m_colEIter_.m_CEBufferSize_
\r
2652 - m_colEIter_.m_CEBufferOffset_;
\r
2653 boolean hasExpansion = expansion > 0;
\r
2654 m_colEIter_.setExactOffset(end);
\r
2655 while (expansion > 0) {
\r
2656 // getting rid of the redundant ce
\r
2657 // since forward contraction/expansion may have extra ces
\r
2658 // if we are in the normalization buffer, hasAccentsBeforeMatch
\r
2659 // would have taken care of it.
\r
2660 // E.g. the character \u01FA will have an expansion of 3, but
\r
2661 // if we are only looking for A ring A\u030A, we'll have to
\r
2662 // skip the last ce in the expansion buffer
\r
2663 m_colEIter_.previous();
\r
2664 if (m_colEIter_.getOffset() != temp) {
\r
2666 temp = m_colEIter_.getOffset();
\r
2671 int count = m_pattern_.m_CELength_;
\r
2672 while (count > 0) {
\r
2673 int ce = getCE(m_colEIter_.previous());
\r
2674 // status checked below, note that if status is a failure
\r
2675 // previous() returns NULLORDER
\r
2676 if (ce == CollationElementIterator.IGNORABLE) {
\r
2679 if (hasExpansion && count == 0
\r
2680 && m_colEIter_.getOffset() != temp) {
\r
2682 temp = m_colEIter_.getOffset();
\r
2684 if (count == m_pattern_.m_CELength_
\r
2685 && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) {
\r
2686 // accents may have extra starting ces, this occurs when a
\r
2687 // pure accent pattern is matched without rearrangement
\r
2688 int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1];
\r
2689 targetText.setIndex(end);
\r
2690 if (UTF16.isTrailSurrogate(targetText.previous())) {
\r
2691 if (targetText.getIndex() > m_textBeginOffset_ &&
\r
2692 !UTF16.isLeadSurrogate(targetText.previous())) {
\r
2693 targetText.next();
\r
2696 end = targetText.getIndex();
\r
2697 if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) {
\r
2698 ce = getCE(m_colEIter_.previous());
\r
2699 while (ce != expected
\r
2700 && ce != CollationElementIterator.NULLORDER
\r
2701 && m_colEIter_.getOffset() <= start) {
\r
2702 ce = getCE(m_colEIter_.previous());
\r
2706 if (ce != m_pattern_.m_CE_[count - 1]) {
\r
2708 start = getPreviousBaseOffset(start);
\r
2709 m_utilBuffer_[0] = start;
\r
2710 m_utilBuffer_[1] = end;
\r
2716 m_utilBuffer_[0] = start;
\r
2717 m_utilBuffer_[1] = end;
\r
2722 * Checks and sets the match information if found.
\r
2725 * <li> the potential match does not repeat the previous match
\r
2726 * <li> boundaries are correct
\r
2727 * <li> potential match does not end in the middle of a contraction
\r
2728 * <li> identical matches
\r
2730 * Otherwise the offset will be shifted to the next character.
\r
2731 * Uses the temporary utility buffer for storing the modified textoffset.
\r
2732 * @param textoffset offset in the collation element text. the returned
\r
2733 * value will be the truncated start offset of the match or the
\r
2734 * new start search offset.
\r
2735 * @return true if the match is valid, false otherwise
\r
2737 private boolean checkPreviousCanonicalMatch(int textoffset)
\r
2739 // to ensure that the start and ends are not composite characters
\r
2740 // if we have a canonical accent match
\r
2741 if (m_pattern_.m_hasSuffixAccents_
\r
2742 && m_canonicalSuffixAccents_.length() != 0
\r
2743 || m_pattern_.m_hasPrefixAccents_
\r
2744 && m_canonicalPrefixAccents_.length() != 0) {
\r
2745 m_matchedIndex_ = textoffset;
\r
2746 matchLength = getNextBaseOffset(m_colEIter_.getOffset())
\r
2751 int end = m_colEIter_.getOffset();
\r
2752 if (!checkPreviousCanonicalContractionMatch(textoffset, end)) {
\r
2753 // storing the modified textoffset
\r
2756 textoffset = m_utilBuffer_[0];
\r
2757 end = m_utilBuffer_[1];
\r
2758 end = getNextBaseOffset(end);
\r
2759 // this totally matches, however we need to check if it is repeating
\r
2760 if (checkRepeatedMatch(textoffset, end)
\r
2761 || !isBreakUnit(textoffset, end)
\r
2762 || !checkIdentical(textoffset, end)) {
\r
2764 textoffset = getPreviousBaseOffset(textoffset);
\r
2765 m_utilBuffer_[0] = textoffset;
\r
2769 m_matchedIndex_ = textoffset;
\r
2770 matchLength = end - textoffset;
\r
2775 * Method that does the next exact match
\r
2776 * @param start the offset to start shifting from and performing the
\r
2777 * next exact match
\r
2779 private void handleNextExact(int start)
\r
2781 int textoffset = shiftForward(start,
\r
2782 CollationElementIterator.NULLORDER,
\r
2783 m_pattern_.m_CELength_);
\r
2784 int targetce = CollationElementIterator.IGNORABLE;
\r
2785 while (textoffset <= m_textLimitOffset_) {
\r
2786 m_colEIter_.setExactOffset(textoffset);
\r
2787 int patternceindex = m_pattern_.m_CELength_ - 1;
\r
2788 boolean found = false;
\r
2789 int lastce = CollationElementIterator.NULLORDER;
\r
2792 // finding the last pattern ce match, imagine composite
\r
2793 // characters. for example: search for pattern A in text \u00C0
\r
2794 // we'll have to skip \u0300 the grave first before we get to A
\r
2795 targetce = m_colEIter_.previous();
\r
2796 if (targetce == CollationElementIterator.NULLORDER) {
\r
2800 targetce = getCE(targetce);
\r
2801 if (targetce == CollationElementIterator.IGNORABLE &&
\r
2802 m_colEIter_.isInBuffer()) {
\r
2803 // this is for the text \u0315\u0300 that requires
\r
2804 // normalization and pattern \u0300, where \u0315 is ignorable
\r
2807 if (lastce == CollationElementIterator.NULLORDER
\r
2808 || lastce == CollationElementIterator.IGNORABLE) {
\r
2809 lastce = targetce;
\r
2811 if (targetce == m_pattern_.m_CE_[patternceindex]) {
\r
2812 // the first ce can be a contraction
\r
2816 if (m_colEIter_.m_CEBufferOffset_ <= 0) {
\r
2822 while (found && patternceindex > 0) {
\r
2823 lastce = targetce;
\r
2824 targetce = m_colEIter_.previous();
\r
2825 if (targetce == CollationElementIterator.NULLORDER) {
\r
2829 targetce = getCE(targetce);
\r
2830 if (targetce == CollationElementIterator.IGNORABLE) {
\r
2834 patternceindex --;
\r
2835 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
2838 targetce = lastce;
\r
2841 textoffset = shiftForward(textoffset, lastce, patternceindex);
\r
2842 // status checked at loop.
\r
2843 patternceindex = m_pattern_.m_CELength_;
\r
2847 if (checkNextExactMatch(textoffset)) {
\r
2848 // status checked in ucol_setOffset
\r
2851 textoffset = m_utilBuffer_[0];
\r
2853 setMatchNotFound();
\r
2857 * Method that does the next canonical match
\r
2858 * @param start the offset to start shifting from and performing the
\r
2859 * next canonical match
\r
2861 private void handleNextCanonical(int start)
\r
2863 boolean hasPatternAccents =
\r
2864 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
\r
2866 // shifting it check for setting offset
\r
2867 // if setOffset is called previously or there was no previous match, we
\r
2868 // leave the offset as it is.
\r
2869 int textoffset = shiftForward(start, CollationElementIterator.NULLORDER,
\r
2870 m_pattern_.m_CELength_);
\r
2871 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
\r
2872 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
\r
2873 int targetce = CollationElementIterator.IGNORABLE;
\r
2875 while (textoffset <= m_textLimitOffset_)
\r
2877 m_colEIter_.setExactOffset(textoffset);
\r
2878 int patternceindex = m_pattern_.m_CELength_ - 1;
\r
2879 boolean found = false;
\r
2880 int lastce = CollationElementIterator.NULLORDER;
\r
2883 // finding the last pattern ce match, imagine composite characters
\r
2884 // for example: search for pattern A in text \u00C0
\r
2885 // we'll have to skip \u0300 the grave first before we get to A
\r
2886 targetce = m_colEIter_.previous();
\r
2887 if (targetce == CollationElementIterator.NULLORDER) {
\r
2891 targetce = getCE(targetce);
\r
2892 if (lastce == CollationElementIterator.NULLORDER
\r
2893 || lastce == CollationElementIterator.IGNORABLE) {
\r
2894 lastce = targetce;
\r
2896 if (targetce == m_pattern_.m_CE_[patternceindex]) {
\r
2897 // the first ce can be a contraction
\r
2901 if (m_colEIter_.m_CEBufferOffset_ <= 0) {
\r
2907 while (found && patternceindex > 0) {
\r
2908 targetce = m_colEIter_.previous();
\r
2909 if (targetce == CollationElementIterator.NULLORDER) {
\r
2913 targetce = getCE(targetce);
\r
2914 if (targetce == CollationElementIterator.IGNORABLE) {
\r
2918 patternceindex --;
\r
2919 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
2922 // initializing the rearranged accent array
\r
2923 if (hasPatternAccents && !found) {
\r
2924 found = doNextCanonicalMatch(textoffset);
\r
2928 textoffset = shiftForward(textoffset, lastce, patternceindex);
\r
2929 // status checked at loop
\r
2930 patternceindex = m_pattern_.m_CELength_;
\r
2934 if (checkNextCanonicalMatch(textoffset)) {
\r
2937 textoffset = m_utilBuffer_[0];
\r
2939 setMatchNotFound();
\r
2943 * Method that does the previous exact match
\r
2944 * @param start the offset to start shifting from and performing the
\r
2945 * previous exact match
\r
2947 private void handlePreviousExact(int start)
\r
2949 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,
\r
2950 m_pattern_.m_CELength_);
\r
2951 while (textoffset >= m_textBeginOffset_)
\r
2953 m_colEIter_.setExactOffset(textoffset);
\r
2954 int patternceindex = 1;
\r
2955 int targetce = CollationElementIterator.IGNORABLE;
\r
2956 boolean found = false;
\r
2957 int firstce = CollationElementIterator.NULLORDER;
\r
2960 // finding the first pattern ce match, imagine composite
\r
2961 // characters. for example: search for pattern \u0300 in text
\r
2962 // \u00C0, we'll have to skip A first before we get to
\r
2963 // \u0300 the grave accent
\r
2964 targetce = m_colEIter_.next();
\r
2965 if (targetce == CollationElementIterator.NULLORDER) {
\r
2969 targetce = getCE(targetce);
\r
2970 if (firstce == CollationElementIterator.NULLORDER
\r
2971 || firstce == CollationElementIterator.IGNORABLE) {
\r
2972 firstce = targetce;
\r
2974 if (targetce == CollationElementIterator.IGNORABLE && m_collator_.getStrength() != Collator.PRIMARY) {
\r
2977 if (targetce == m_pattern_.m_CE_[0]) {
\r
2981 if (m_colEIter_.m_CEBufferOffset_ == -1
\r
2982 || m_colEIter_.m_CEBufferOffset_
\r
2983 == m_colEIter_.m_CEBufferSize_) {
\r
2984 // checking for accents in composite character
\r
2990 //targetce = firstce;
\r
2992 while (found && patternceindex < m_pattern_.m_CELength_) {
\r
2993 firstce = targetce;
\r
2994 targetce = m_colEIter_.next();
\r
2995 if (targetce == CollationElementIterator.NULLORDER) {
\r
2999 targetce = getCE(targetce);
\r
3000 if (targetce == CollationElementIterator.IGNORABLE) {
\r
3004 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
3005 patternceindex ++;
\r
3008 targetce = firstce;
\r
3011 textoffset = reverseShift(textoffset, targetce, patternceindex);
\r
3012 patternceindex = 0;
\r
3016 if (checkPreviousExactMatch(textoffset)) {
\r
3019 textoffset = m_utilBuffer_[0];
\r
3021 setMatchNotFound();
\r
3025 * Method that does the previous canonical match
\r
3026 * @param start the offset to start shifting from and performing the
\r
3027 * previous canonical match
\r
3029 private void handlePreviousCanonical(int start)
\r
3031 boolean hasPatternAccents =
\r
3032 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
\r
3034 // shifting it check for setting offset
\r
3035 // if setOffset is called previously or there was no previous match, we
\r
3036 // leave the offset as it is.
\r
3037 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,
\r
3038 m_pattern_.m_CELength_);
\r
3039 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
\r
3040 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
\r
3042 while (textoffset >= m_textBeginOffset_)
\r
3044 m_colEIter_.setExactOffset(textoffset);
\r
3045 int patternceindex = 1;
\r
3046 int targetce = CollationElementIterator.IGNORABLE;
\r
3047 boolean found = false;
\r
3048 int firstce = CollationElementIterator.NULLORDER;
\r
3051 // finding the first pattern ce match, imagine composite
\r
3052 // characters. for example: search for pattern \u0300 in text
\r
3053 // \u00C0, we'll have to skip A first before we get to
\r
3054 // \u0300 the grave accent
\r
3055 targetce = m_colEIter_.next();
\r
3056 if (targetce == CollationElementIterator.NULLORDER) {
\r
3060 targetce = getCE(targetce);
\r
3061 if (firstce == CollationElementIterator.NULLORDER
\r
3062 || firstce == CollationElementIterator.IGNORABLE) {
\r
3063 firstce = targetce;
\r
3066 if (targetce == m_pattern_.m_CE_[0]) {
\r
3067 // the first ce can be a contraction
\r
3071 if (m_colEIter_.m_CEBufferOffset_ == -1
\r
3072 || m_colEIter_.m_CEBufferOffset_
\r
3073 == m_colEIter_.m_CEBufferSize_) {
\r
3074 // checking for accents in composite character
\r
3080 targetce = firstce;
\r
3082 while (found && patternceindex < m_pattern_.m_CELength_) {
\r
3083 targetce = m_colEIter_.next();
\r
3084 if (targetce == CollationElementIterator.NULLORDER) {
\r
3088 targetce = getCE(targetce);
\r
3089 if (targetce == CollationElementIterator.IGNORABLE) {
\r
3093 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
3094 patternceindex ++;
\r
3097 // initializing the rearranged accent array
\r
3098 if (hasPatternAccents && !found) {
\r
3099 found = doPreviousCanonicalMatch(textoffset);
\r
3103 textoffset = reverseShift(textoffset, targetce, patternceindex);
\r
3104 patternceindex = 0;
\r
3108 if (checkPreviousCanonicalMatch(textoffset)) {
\r
3111 textoffset = m_utilBuffer_[0];
\r
3113 setMatchNotFound();
\r
3117 * Gets a substring out of a CharacterIterator
\r
3118 * @param text CharacterIterator
\r
3119 * @param start start offset
\r
3120 * @param length of substring
\r
3121 * @return substring from text starting at start and length length
\r
3123 private static final String getString(CharacterIterator text, int start,
\r
3126 StringBuilder result = new StringBuilder(length);
\r
3127 int offset = text.getIndex();
\r
3128 text.setIndex(start);
\r
3129 for (int i = 0; i < length; i ++) {
\r
3130 result.append(text.current());
\r
3133 text.setIndex(offset);
\r
3134 return result.toString();
\r
3138 * Getting the mask for collation strength
\r
3139 * @param strength collation strength
\r
3140 * @return collation element mask
\r
3142 private static final int getMask(int strength)
\r
3144 switch (strength)
\r
3146 case Collator.PRIMARY:
\r
3147 return RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3148 case Collator.SECONDARY:
\r
3149 return RuleBasedCollator.CE_SECONDARY_MASK_
\r
3150 | RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3152 return RuleBasedCollator.CE_TERTIARY_MASK_
\r
3153 | RuleBasedCollator.CE_SECONDARY_MASK_
\r
3154 | RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3159 * Sets match not found
\r
3161 private void setMatchNotFound()
\r
3163 // this method resets the match result regardless of the error status.
\r
3164 m_matchedIndex_ = DONE;
\r
3165 setMatchLength(0);
\r
3169 * Check the boundaries of the match.
\r
3171 private int checkBreakBoundary(int end) {
\r
3172 if (!m_charBreakIter_.isBoundary(end)) {
\r
3173 end = m_charBreakIter_.following(end);
\r