2 *******************************************************************************
\r
3 * Copyright (C) 1996-2007, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.text.CharacterIterator;
\r
11 import java.text.StringCharacterIterator;
\r
12 import java.util.Locale;
\r
14 import com.ibm.icu.impl.CharacterIteratorWrapper;
\r
15 import com.ibm.icu.impl.NormalizerImpl;
\r
16 import com.ibm.icu.lang.UCharacter;
\r
17 import com.ibm.icu.util.ULocale;
\r
21 * <code>StringSearch</code> is the concrete subclass of
\r
22 * <code>SearchIterator</code> that provides language-sensitive text searching
\r
23 * based on the comparison rules defined in a {@link RuleBasedCollator} object.
\r
26 * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
\r
27 * algorithm that has been adapted to work with the large character set of
\r
28 * Unicode. Refer to
\r
29 * <a href="http://www.icu-project.org/docs/papers/efficient_text_searching_in_java.html">
\r
30 * "Efficient Text Searching in Java"</a>, published in the
\r
31 * <i>Java Report</i> on February, 1999, for further information on the
\r
35 * Users are also strongly encouraged to read the section on
\r
36 * <a href="http://www.icu-project.org/userguide/searchString.html">
\r
37 * String Search</a> and
\r
38 * <a href="http://www.icu-project.org/userguide/Collate_Intro.html">
\r
39 * Collation</a> in the user guide before attempting to use this class.
\r
42 * String searching becomes a little complicated when accents are encountered at
\r
43 * match boundaries. If a match is found and it has preceding or trailing
\r
44 * accents not part of the match, the result returned will include the
\r
45 * preceding accents up to the first base character, if the pattern searched
\r
46 * for starts with an accent. Likewise,
\r
47 * if the pattern ends with an accent, all trailing accents up to the first
\r
48 * base character will be included in the result.
\r
51 * For example, if a match is found in target text "a\u0325\u0300" for
\r
53 * "a\u0325", the result returned by StringSearch will be the index 0 and
\r
54 * length 3 <0, 3>. If a match is found in the target
\r
55 * "a\u0325\u0300"
\r
56 * for the pattern "\u0300", then the result will be index 1 and length 2
\r
60 * In the case where the decomposition mode is on for the RuleBasedCollator,
\r
61 * all matches that start or end with an accent will have their results include
\r
62 * preceding or following accents respectively. For example, if pattern "a" is
\r
63 * looked for in the target text "á\u0325", the result will be
\r
64 * index 0 and length 2 <0, 2>.
\r
67 * The StringSearch class provides two options to handle accent matching
\r
71 * Let S' be the sub-string of a text string S between the offsets start and
\r
72 * end <start, end>.
\r
74 * A pattern string P matches a text string S at the offsets <start,
\r
79 * option 1. P matches some canonical equivalent string of S'. Suppose the
\r
80 * RuleBasedCollator used for searching has a collation strength of
\r
81 * TERTIARY, all accents are non-ignorable. If the pattern
\r
82 * "a\u0300" is searched in the target text
\r
83 * "a\u0325\u0300",
\r
84 * a match will be found, since the target text is canonically
\r
85 * equivalent to "a\u0300\u0325"
\r
86 * option 2. P matches S' and if P starts or ends with a combining mark,
\r
87 * there exists no non-ignorable combining mark before or after S'
\r
88 * in S respectively. Following the example above, the pattern
\r
89 * "a\u0300" will not find a match in "a\u0325\u0300",
\r
91 * there exists a non-ignorable accent '\u0325' in the middle of
\r
92 * 'a' and '\u0300'. Even with a target text of
\r
93 * "a\u0300\u0325" a match will not be found because of the
\r
94 * non-ignorable trailing accent \u0325.
\r
96 * Option 2. will be the default mode for dealing with boundary accents unless
\r
97 * specified via the API setCanonical(boolean).
\r
98 * One restriction is to be noted for option 1. Currently there are no
\r
99 * composite characters that consist of a character with combining class > 0
\r
100 * before a character with combining class == 0. However, if such a character
\r
101 * exists in the future, the StringSearch may not work correctly with option 1
\r
102 * when such characters are encountered.
\r
105 * <tt>SearchIterator</tt> provides APIs to specify the starting position
\r
106 * within the text string to be searched, e.g. <tt>setIndex</tt>,
\r
107 * <tt>preceding</tt> and <tt>following</tt>. Since the starting position will
\r
108 * be set as it is specified, please take note that there are some dangerous
\r
109 * positions which the search may render incorrect results:
\r
111 * <li> The midst of a substring that requires decomposition.
\r
112 * <li> If the following match is to be found, the position should not be the
\r
113 * second character which requires to be swapped with the preceding
\r
114 * character. Vice versa, if the preceding match is to be found,
\r
115 * position to search from should not be the first character which
\r
116 * requires to be swapped with the next character. E.g certain Thai and
\r
117 * Lao characters require swapping.
\r
118 * <li> If a following pattern match is to be found, any position within a
\r
119 * contracting sequence except the first will fail. Vice versa if a
\r
120 * preceding pattern match is to be found, an invalid starting point
\r
121 * would be any character within a contracting sequence except the last.
\r
125 * Though collator attributes will be taken into consideration while
\r
126 * performing matches, there are no APIs provided in StringSearch for setting
\r
127 * and getting the attributes. These attributes can be set by getting the
\r
128 * collator from <tt>getCollator</tt> and using the APIs in
\r
129 * <tt>com.ibm.icu.text.Collator</tt>. To update StringSearch to the new
\r
130 * collator attributes, <tt>reset()</tt> or
\r
131 * <tt>setCollator(RuleBasedCollator)</tt> has to be called.
\r
135 * <a href="http://www.icu-project.org/userguide/searchString.html">
\r
136 * String Search</a> user guide and the <code>SearchIterator</code>
\r
137 * documentation for more information and examples of use.
\r
140 * This class is not subclassable
\r
142 * @see SearchIterator
\r
143 * @see RuleBasedCollator
\r
144 * @author Laura Werner, synwee
\r
147 // internal notes: all methods do not guarantee the correct status of the
\r
148 // characteriterator. the caller has to maintain the original index position
\r
149 // if necessary. methods could change the index position as it deems fit
\r
150 public final class StringSearch extends SearchIterator
\r
153 // public constructors --------------------------------------------------
\r
156 * Initializes the iterator to use the language-specific rules defined in
\r
157 * the argument collator to search for argument pattern in the argument
\r
158 * target text. The argument breakiter is used to define logical matches.
\r
159 * See super class documentation for more details on the use of the target
\r
160 * text and BreakIterator.
\r
161 * @param pattern text to look for.
\r
162 * @param target target text to search for pattern.
\r
163 * @param collator RuleBasedCollator that defines the language rules
\r
164 * @param breakiter A {@link BreakIterator} that is used to determine the
\r
165 * boundaries of a logical match. This argument can be null.
\r
166 * @exception IllegalArgumentException thrown when argument target is null,
\r
168 * @see BreakIterator
\r
169 * @see RuleBasedCollator
\r
170 * @see SearchIterator
\r
173 public StringSearch(String pattern, CharacterIterator target,
\r
174 RuleBasedCollator collator, BreakIterator breakiter)
\r
176 super(target, breakiter);
\r
177 m_textBeginOffset_ = targetText.getBeginIndex();
\r
178 m_textLimitOffset_ = targetText.getEndIndex();
\r
179 m_collator_ = collator;
\r
180 m_colEIter_ = m_collator_.getCollationElementIterator(target);
\r
181 m_utilColEIter_ = collator.getCollationElementIterator("");
\r
182 m_ceMask_ = getMask(m_collator_.getStrength());
\r
183 m_isCanonicalMatch_ = false;
\r
184 m_pattern_ = new Pattern(pattern);
\r
185 m_matchedIndex_ = DONE;
\r
186 m_charBreakIter_ = BreakIterator.getCharacterInstance(/*m_collator_.getLocale(ULocale.ACTUAL_LOCALE)*/);
\r
187 m_charBreakIter_.setText(target);
\r
192 * Initializes the iterator to use the language-specific rules defined in
\r
193 * the argument collator to search for argument pattern in the argument
\r
194 * target text. No BreakIterators are set to test for logical matches.
\r
195 * @param pattern text to look for.
\r
196 * @param target target text to search for pattern.
\r
197 * @param collator RuleBasedCollator that defines the language rules
\r
198 * @exception IllegalArgumentException thrown when argument target is null,
\r
200 * @see RuleBasedCollator
\r
201 * @see SearchIterator
\r
204 public StringSearch(String pattern, CharacterIterator target,
\r
205 RuleBasedCollator collator)
\r
207 this(pattern, target, collator, null/*BreakIterator.getCharacterInstance()*/);
\r
211 * Initializes the iterator to use the language-specific rules and
\r
212 * break iterator rules defined in the argument locale to search for
\r
213 * argument pattern in the argument target text.
\r
214 * See super class documentation for more details on the use of the target
\r
215 * text and BreakIterator.
\r
216 * @param pattern text to look for.
\r
217 * @param target target text to search for pattern.
\r
218 * @param locale locale to use for language and break iterator rules
\r
219 * @exception IllegalArgumentException thrown when argument target is null,
\r
220 * or of length 0. ClassCastException thrown if the collator for
\r
221 * the specified locale is not a RuleBasedCollator.
\r
222 * @see BreakIterator
\r
223 * @see RuleBasedCollator
\r
224 * @see SearchIterator
\r
227 public StringSearch(String pattern, CharacterIterator target, Locale locale)
\r
229 this(pattern, target, ULocale.forLocale(locale));
\r
233 * Initializes the iterator to use the language-specific rules and
\r
234 * break iterator rules defined in the argument locale to search for
\r
235 * argument pattern in the argument target text.
\r
236 * See super class documentation for more details on the use of the target
\r
237 * text and BreakIterator.
\r
238 * @param pattern text to look for.
\r
239 * @param target target text to search for pattern.
\r
240 * @param locale ulocale to use for language and break iterator rules
\r
241 * @exception IllegalArgumentException thrown when argument target is null,
\r
242 * or of length 0. ClassCastException thrown if the collator for
\r
243 * the specified locale is not a RuleBasedCollator.
\r
244 * @see BreakIterator
\r
245 * @see RuleBasedCollator
\r
246 * @see SearchIterator
\r
249 public StringSearch(String pattern, CharacterIterator target, ULocale locale)
\r
251 this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale),
\r
252 null/*BreakIterator.getCharacterInstance(locale)*/);
\r
256 * Initializes the iterator to use the language-specific rules and
\r
257 * break iterator rules defined in the default locale to search for
\r
258 * argument pattern in the argument target text.
\r
259 * See super class documentation for more details on the use of the target
\r
260 * text and BreakIterator.
\r
261 * @param pattern text to look for.
\r
262 * @param target target text to search for pattern.
\r
263 * @exception IllegalArgumentException thrown when argument target is null,
\r
264 * or of length 0. ClassCastException thrown if the collator for
\r
265 * the default locale is not a RuleBasedCollator.
\r
266 * @see BreakIterator
\r
267 * @see RuleBasedCollator
\r
268 * @see SearchIterator
\r
271 public StringSearch(String pattern, String target)
\r
273 this(pattern, new StringCharacterIterator(target),
\r
274 (RuleBasedCollator)Collator.getInstance(),
\r
275 null/*BreakIterator.getCharacterInstance()*/);
\r
278 // public getters -----------------------------------------------------
\r
282 * Gets the RuleBasedCollator used for the language rules.
\r
285 * Since StringSearch depends on the returned RuleBasedCollator, any
\r
286 * changes to the RuleBasedCollator result should follow with a call to
\r
287 * either StringSearch.reset() or
\r
288 * StringSearch.setCollator(RuleBasedCollator) to ensure the correct
\r
289 * search behaviour.
\r
291 * @return RuleBasedCollator used by this StringSearch
\r
292 * @see RuleBasedCollator
\r
293 * @see #setCollator
\r
296 public RuleBasedCollator getCollator()
\r
298 return m_collator_;
\r
302 * Returns the pattern for which StringSearch is searching for.
\r
303 * @return the pattern searched for
\r
306 public String getPattern()
\r
308 return m_pattern_.targetText;
\r
312 * Return the index in the target text where the iterator is currently
\r
314 * If the iteration has gone past the end of the target text or past
\r
315 * the beginning for a backwards search, {@link #DONE} is returned.
\r
316 * @return index in the target text where the iterator is currently
\r
320 public int getIndex()
\r
322 int result = m_colEIter_.getOffset();
\r
323 if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) {
\r
330 * Determines whether canonical matches (option 1, as described in the
\r
331 * class documentation) is set.
\r
332 * See setCanonical(boolean) for more information.
\r
333 * @see #setCanonical
\r
334 * @return true if canonical matches is set, false otherwise
\r
337 public boolean isCanonical()
\r
339 return m_isCanonicalMatch_;
\r
342 // public setters -----------------------------------------------------
\r
346 * Sets the RuleBasedCollator to be used for language-specific searching.
\r
349 * This method causes internal data such as Boyer-Moore shift tables
\r
350 * to be recalculated, but the iterator's position is unchanged.
\r
352 * @param collator to use for this StringSearch
\r
353 * @exception IllegalArgumentException thrown when collator is null
\r
354 * @see #getCollator
\r
357 public void setCollator(RuleBasedCollator collator)
\r
359 if (collator == null) {
\r
360 throw new IllegalArgumentException("Collator can not be null");
\r
362 m_collator_ = collator;
\r
363 m_ceMask_ = getMask(m_collator_.getStrength());
\r
364 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
\r
366 m_colEIter_.setCollator(m_collator_);
\r
367 m_utilColEIter_.setCollator(m_collator_);
\r
368 m_charBreakIter_ = BreakIterator.getCharacterInstance(/*collator.getLocale(ULocale.VALID_LOCALE)*/);
\r
369 m_charBreakIter_.setText(targetText);
\r
374 * Set the pattern to search for.
\r
377 * This method causes internal data such as Boyer-Moore shift tables
\r
378 * to be recalculated, but the iterator's position is unchanged.
\r
380 * @param pattern for searching
\r
382 * @exception IllegalArgumentException thrown if pattern is null or of
\r
386 public void setPattern(String pattern)
\r
388 if (pattern == null || pattern.length() <= 0) {
\r
389 throw new IllegalArgumentException(
\r
390 "Pattern to search for can not be null or of length 0");
\r
392 m_pattern_.targetText = pattern;
\r
397 * Set the target text to be searched. Text iteration will hence begin at
\r
398 * the start of the text string. This method is useful if you want to
\r
399 * re-use an iterator to search within a different body of text.
\r
400 * @param text new text iterator to look for match,
\r
401 * @exception IllegalArgumentException thrown when text is null or has
\r
406 public void setTarget(CharacterIterator text)
\r
408 super.setTarget(text);
\r
409 m_textBeginOffset_ = targetText.getBeginIndex();
\r
410 m_textLimitOffset_ = targetText.getEndIndex();
\r
411 m_colEIter_.setText(targetText);
\r
412 m_charBreakIter_.setText(targetText);
\r
417 * Sets the position in the target text which the next search will start
\r
418 * from to the argument. This method clears all previous states.
\r
421 * This method takes the argument position and sets the position in the
\r
422 * target text accordingly, without checking if position is pointing to a
\r
423 * valid starting point to begin searching.
\r
426 * Search positions that may render incorrect results are highlighted in
\r
427 * the class documentation.
\r
429 * @param position index to start next search from.
\r
430 * @exception IndexOutOfBoundsException thrown if argument position is out
\r
431 * of the target text range.
\r
435 public void setIndex(int position)
\r
437 super.setIndex(position);
\r
438 m_matchedIndex_ = DONE;
\r
439 m_colEIter_.setExactOffset(position);
\r
444 * Set the canonical match mode. See class documentation for details.
\r
445 * The default setting for this property is false.
\r
447 * @param allowCanonical flag indicator if canonical matches are allowed
\r
448 * @see #isCanonical
\r
451 public void setCanonical(boolean allowCanonical)
\r
453 m_isCanonicalMatch_ = allowCanonical;
\r
454 if (m_isCanonicalMatch_ == true) {
\r
455 if (m_canonicalPrefixAccents_ == null) {
\r
456 m_canonicalPrefixAccents_ = new StringBuffer();
\r
459 m_canonicalPrefixAccents_.delete(0,
\r
460 m_canonicalPrefixAccents_.length());
\r
462 if (m_canonicalSuffixAccents_ == null) {
\r
463 m_canonicalSuffixAccents_ = new StringBuffer();
\r
466 m_canonicalSuffixAccents_.delete(0,
\r
467 m_canonicalSuffixAccents_.length());
\r
472 // public miscellaneous methods -----------------------------------------
\r
476 * Resets the search iteration. All properties will be reset to the
\r
480 * Search will begin at the start of the target text if a forward iteration
\r
481 * is initiated before a backwards iteration. Otherwise if a
\r
482 * backwards iteration is initiated before a forwards iteration, the search
\r
483 * will begin at the end of the target text.
\r
486 * Canonical match option will be reset to false, ie an exact match.
\r
490 public void reset()
\r
492 // reset is setting the attributes that are already in string search,
\r
493 // hence all attributes in the collator should be retrieved without any
\r
496 m_isCanonicalMatch_ = false;
\r
497 m_ceMask_ = getMask(m_collator_.getStrength());
\r
498 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
\r
500 m_colEIter_.setCollator(m_collator_);
\r
501 m_colEIter_.reset();
\r
502 m_utilColEIter_.setCollator(m_collator_);
\r
505 // protected methods -----------------------------------------------------
\r
509 * Concrete method to provide the mechanism
\r
510 * for finding the next <b>forwards</b> match in the target text.
\r
511 * See super class documentation for its use.
\r
513 * @param start index in the target text at which the forwards search
\r
515 * @return the starting index of the next forwards match if found, DONE
\r
517 * @see #handlePrevious(int)
\r
521 protected int handleNext(int start)
\r
523 if (m_pattern_.m_CELength_ == 0) {
\r
525 if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
\r
526 m_matchedIndex_ = start;
\r
527 return m_matchedIndex_;
\r
530 targetText.setIndex(start);
\r
531 char ch = targetText.current();
\r
532 // ch can never be done, it is handled by next()
\r
533 char ch2 = targetText.next();
\r
534 if (ch2 == CharacterIterator.DONE) {
\r
535 m_matchedIndex_ = DONE;
\r
538 m_matchedIndex_ = targetText.getIndex();
\r
540 if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) {
\r
542 m_matchedIndex_ = targetText.getIndex();
\r
546 if (matchLength <= 0) {
\r
547 // we must have reversed direction after we reached the start
\r
548 // of the target text
\r
549 // see SearchIterator next(), it checks the bounds and returns
\r
550 // if it exceeds the range. It does not allow setting of
\r
552 if (start == m_textBeginOffset_) {
\r
553 m_matchedIndex_ = DONE;
\r
556 // for boundary check purposes. this will ensure that the
\r
557 // next match will not preceed the current offset
\r
558 // note search->matchedIndex will always be set to something
\r
560 m_matchedIndex_ = start - 1;
\r
564 // status checked below
\r
565 if (m_isCanonicalMatch_) {
\r
566 // can't use exact here since extra accents are allowed.
\r
567 handleNextCanonical(start);
\r
570 handleNextExact(start);
\r
573 if (m_matchedIndex_ == DONE) {
\r
574 targetText.setIndex(m_textLimitOffset_);
\r
577 targetText.setIndex(m_matchedIndex_);
\r
579 return m_matchedIndex_;
\r
584 * Concrete method to provide the mechanism
\r
585 * for finding the next <b>backwards</b> match in the target text.
\r
586 * See super class documentation for its use.
\r
588 * @param start index in the target text at which the backwards search
\r
590 * @return the starting index of the next backwards match if found, DONE
\r
592 * @see #handleNext(int)
\r
596 protected int handlePrevious(int start)
\r
598 if (m_pattern_.m_CELength_ == 0) {
\r
600 // start can never be DONE or 0, it is handled in previous
\r
601 targetText.setIndex(start);
\r
602 char ch = targetText.previous();
\r
603 if (ch == CharacterIterator.DONE) {
\r
604 m_matchedIndex_ = DONE;
\r
607 m_matchedIndex_ = targetText.getIndex();
\r
608 if (UTF16.isTrailSurrogate(ch)) {
\r
609 if (UTF16.isLeadSurrogate(targetText.previous())) {
\r
610 m_matchedIndex_ = targetText.getIndex();
\r
616 if (matchLength == 0) {
\r
617 // we must have reversed direction after we reached the end
\r
618 // of the target text
\r
619 // see SearchIterator next(), it checks the bounds and returns
\r
620 // if it exceeds the range. It does not allow setting of
\r
622 m_matchedIndex_ = DONE;
\r
624 if (m_isCanonicalMatch_) {
\r
625 // can't use exact here since extra accents are allowed.
\r
626 handlePreviousCanonical(start);
\r
629 handlePreviousExact(start);
\r
633 if (m_matchedIndex_ == DONE) {
\r
634 targetText.setIndex(m_textBeginOffset_);
\r
637 targetText.setIndex(m_matchedIndex_);
\r
639 return m_matchedIndex_;
\r
642 // private static inner classes ----------------------------------------
\r
644 private static class Pattern
\r
646 // protected methods -----------------------------------------------
\r
651 protected String targetText;
\r
653 * Array containing the collation elements of targetText
\r
655 protected int m_CE_[];
\r
657 * Number of collation elements in m_CE_
\r
659 protected int m_CELength_;
\r
661 * Flag indicator if targetText starts with an accent
\r
663 protected boolean m_hasPrefixAccents_;
\r
665 * Flag indicator if targetText ends with an accent
\r
667 protected boolean m_hasSuffixAccents_;
\r
669 * Default number of characters to shift for Boyer Moore
\r
671 protected int m_defaultShiftSize_;
\r
673 * Number of characters to shift for Boyer Moore, depending on the
\r
674 * source text to search
\r
676 protected char m_shift_[];
\r
678 * Number of characters to shift backwards for Boyer Moore, depending
\r
679 * on the source text to search
\r
681 protected char m_backShift_[];
\r
683 // protected constructors ------------------------------------------
\r
686 * Empty constructor
\r
688 protected Pattern(String pattern)
\r
690 targetText = pattern;
\r
691 m_CE_ = new int[INITIAL_ARRAY_SIZE_];
\r
693 m_hasPrefixAccents_ = false;
\r
694 m_hasSuffixAccents_ = false;
\r
695 m_defaultShiftSize_ = 1;
\r
696 m_shift_ = new char[MAX_TABLE_SIZE_];
\r
697 m_backShift_ = new char[MAX_TABLE_SIZE_];
\r
702 // private data members ------------------------------------------------
\r
705 * target text begin offset. Each targetText has a valid contiguous region
\r
706 * to iterate and this data member is the offset to the first such
\r
707 * character in the region.
\r
709 private int m_textBeginOffset_;
\r
711 * target text limit offset. Each targetText has a valid contiguous region
\r
712 * to iterate and this data member is the offset to 1 after the last such
\r
713 * character in the region.
\r
715 private int m_textLimitOffset_;
\r
717 * Upon completion of a search, m_matchIndex_ will store starting offset in
\r
718 * m_text for the match. The Value DONE is the default value.
\r
719 * If we are not at the start of the text or the end of the text and
\r
720 * m_matchedIndex_ is DONE it means that we can find any more matches in
\r
721 * that particular direction
\r
723 private int m_matchedIndex_;
\r
725 * Current pattern to search for
\r
727 private Pattern m_pattern_;
\r
729 * Collator whose rules are used to perform the search
\r
731 private RuleBasedCollator m_collator_;
\r
733 * The collation element iterator for the text source.
\r
735 private CollationElementIterator m_colEIter_;
\r
737 * Utility collation element, used throughout program for temporary
\r
740 private CollationElementIterator m_utilColEIter_;
\r
742 * The mask used on the collation elements to retrieve the valid strength
\r
745 private int m_ceMask_;
\r
747 * Buffer storing accents during a canonical search
\r
749 private StringBuffer m_canonicalPrefixAccents_;
\r
751 * Buffer storing accents during a canonical search
\r
753 private StringBuffer m_canonicalSuffixAccents_;
\r
755 * Flag to indicate if canonical search is to be done.
\r
756 * E.g looking for "a\u0300" in "a\u0318\u0300" will yield the match at 0.
\r
758 private boolean m_isCanonicalMatch_;
\r
760 * Character break iterator for boundary checking.
\r
762 private BreakIterator m_charBreakIter_;
\r
764 * Size of the shift tables
\r
766 private static final int MAX_TABLE_SIZE_ = 257;
\r
768 * Initial array size
\r
770 private static final int INITIAL_ARRAY_SIZE_ = 256;
\r
774 private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
\r
778 private static final int LAST_BYTE_MASK_ = 0xff;
\r
780 * Utility buffer for return values and temporary storage
\r
782 private int m_utilBuffer_[] = new int[2];
\r
784 * Unsigned 32-Bit Integer Mask
\r
786 private static final long UNSIGNED_32BIT_MASK = 0xffffffffL;
\r
788 // private methods -------------------------------------------------------
\r
791 * Hash a collation element from its full size (32 bits) down into a
\r
792 * value that can be used as an index into the shift tables. Right
\r
793 * now we do a modulus by the size of the hash table.
\r
794 * @param ce collation element
\r
795 * @return collapsed version of the collation element
\r
797 private static final int hash(int ce)
\r
799 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
\r
800 // well with the new collation where most of the latin 1 characters
\r
801 // are of the value xx000xxx. their hashes will most of the time be 0
\r
802 // to be discussed on the hash algo.
\r
803 return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_;
\r
807 * Gets the fcd value for a character at the argument index.
\r
808 * This method takes into accounts of the supplementary characters.
\r
809 * Note this method changes the offset in the character iterator.
\r
810 * @param str UTF16 string where character for fcd retrieval resides
\r
811 * @param offset position of the character whose fcd is to be retrieved
\r
812 * @return fcd value
\r
814 private static final char getFCD(CharacterIterator str, int offset)
\r
816 str.setIndex(offset);
\r
817 char ch = str.current();
\r
818 char result = NormalizerImpl.getFCD16(ch);
\r
820 if ((result != 0) && (str.getEndIndex() != offset + 1) &&
\r
821 UTF16.isLeadSurrogate(ch)) {
\r
823 if (UTF16.isTrailSurrogate(ch)) {
\r
824 result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch);
\r
833 * Gets the fcd value for a character at the argument index.
\r
834 * This method takes into accounts of the supplementary characters.
\r
835 * @param str UTF16 string where character for fcd retrieval resides
\r
836 * @param offset position of the character whose fcd is to be retrieved
\r
837 * @return fcd value
\r
839 private static final char getFCD(String str, int offset)
\r
841 char ch = str.charAt(offset);
\r
842 char result = NormalizerImpl.getFCD16(ch);
\r
844 if ((result != 0) && (str.length() != offset + 1) &&
\r
845 UTF16.isLeadSurrogate(ch)) {
\r
846 ch = str.charAt(offset + 1);
\r
847 if (UTF16.isTrailSurrogate(ch)) {
\r
848 result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch);
\r
857 * Getting the modified collation elements taking into account the collation
\r
860 * @return the modified collation element
\r
862 private final int getCE(int ce)
\r
864 // note for tertiary we can't use the collator->tertiaryMask, that
\r
865 // is a preprocessed mask that takes into account case options. since
\r
866 // we are only concerned with exact matches, we don't need that.
\r
869 if (m_collator_.isAlternateHandlingShifted()) {
\r
870 // alternate handling here, since only the 16 most significant
\r
871 // digits is only used, we can safely do a compare without masking
\r
872 // if the ce is a variable, we mask and get only the primary values
\r
873 // no shifting to quartenary is required since all primary values
\r
874 // less than variabletop will need to be masked off anyway.
\r
875 if (((m_collator_.m_variableTopValue_ << 16) & UNSIGNED_32BIT_MASK) > (ce & UNSIGNED_32BIT_MASK)) {
\r
876 if (m_collator_.getStrength() == Collator.QUATERNARY) {
\r
877 ce = CollationElementIterator.primaryOrder(ce);
\r
880 ce = CollationElementIterator.IGNORABLE;
\r
889 * Appends a int to a int array, increasing the size of the array when
\r
890 * we are out of space.
\r
891 * @param offset in array to append to
\r
892 * @param value to append
\r
893 * @param array to append to
\r
894 * @return the array appended to, this could be a new and bigger array
\r
896 private static final int[] append(int offset, int value, int array[])
\r
898 if (offset >= array.length) {
\r
899 int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
\r
900 System.arraycopy(array, 0, temp, 0, array.length);
\r
903 array[offset] = value;
\r
908 * Initializing the ce table for a pattern. Stores non-ignorable collation
\r
909 * keys. Table size will be estimated by the size of the pattern text.
\r
910 * Table expansion will be perform as we go along. Adding 1 to ensure that
\r
911 * the table size definitely increases.
\r
912 * Internal method, status assumed to be a success.
\r
913 * @return total number of expansions
\r
915 private final int initializePatternCETable()
\r
917 m_utilColEIter_.setText(m_pattern_.targetText);
\r
921 int ce = m_utilColEIter_.next();
\r
923 while (ce != CollationElementIterator.NULLORDER) {
\r
924 int newce = getCE(ce);
\r
925 if (newce != CollationElementIterator.IGNORABLE) {
\r
926 m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_);
\r
929 result += m_utilColEIter_.getMaxExpansion(ce) - 1;
\r
930 ce = m_utilColEIter_.next();
\r
933 m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_);
\r
934 m_pattern_.m_CELength_ = offset;
\r
940 * Initializes the pattern struct.
\r
941 * Internal method, status assumed to be success.
\r
942 * @return expansionsize the total expansion size of the pattern
\r
944 private final int initializePattern()
\r
946 if (m_collator_.getStrength() == Collator.PRIMARY) {
\r
947 m_pattern_.m_hasPrefixAccents_ = false;
\r
948 m_pattern_.m_hasSuffixAccents_ = false;
\r
950 m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0)
\r
951 >> SECOND_LAST_BYTE_SHIFT_) != 0;
\r
952 m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText,
\r
953 m_pattern_.targetText.length()
\r
955 & LAST_BYTE_MASK_) != 0;
\r
957 // since intializePattern is an internal method status is a success.
\r
958 return initializePatternCETable();
\r
962 * Initializing shift tables, with the default values.
\r
963 * If a corresponding default value is 0, the shift table is not set.
\r
964 * @param shift table for forwards shift
\r
965 * @param backshift table for backwards shift
\r
966 * @param cetable table containing pattern ce
\r
967 * @param cesize size of the pattern ces
\r
968 * @param expansionsize total size of the expansions
\r
969 * @param defaultforward the default forward value
\r
970 * @param defaultbackward the default backward value
\r
972 private final void setShiftTable(char shift[],
\r
974 int cetable[], int cesize,
\r
976 char defaultforward,
\r
977 char defaultbackward)
\r
979 // estimate the value to shift. to do that we estimate the smallest
\r
980 // number of characters to give the relevant ces, ie approximately
\r
981 // the number of ces minus their expansion, since expansions can come
\r
982 // from a character.
\r
983 for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
\r
984 shift[count] = defaultforward;
\r
986 cesize --; // down to the last index
\r
987 for (int count = 0; count < cesize; count ++) {
\r
988 // number of ces from right of array to the count
\r
989 int temp = defaultforward - count - 1;
\r
990 shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1;
\r
992 shift[hash(cetable[cesize])] = 1;
\r
993 // for ignorables we just shift by one. see test examples.
\r
994 shift[hash(0)] = 1;
\r
996 for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
\r
997 backshift[count] = defaultbackward;
\r
999 for (int count = cesize; count > 0; count --) {
\r
1000 // the original value count does not seem to work
\r
1001 backshift[hash(cetable[count])] = (char)(count > expansionsize ?
\r
1002 count - expansionsize : 1);
\r
1004 backshift[hash(cetable[0])] = 1;
\r
1005 backshift[hash(0)] = 1;
\r
1009 * <p>Building of the pattern collation element list and the Boyer Moore
\r
1010 * StringSearch table.</p>
\r
1011 * <p>The canonical match will only be performed after the default match
\r
1013 * <p>For both cases we need to remember the size of the composed and
\r
1014 * decomposed versions of the string. Since the Boyer-Moore shift
\r
1015 * calculations shifts by a number of characters in the text and tries to
\r
1016 * match the pattern from that offset, the shift value can not be too large
\r
1017 * in case we miss some characters. To choose a right shift size, we
\r
1018 * estimate the NFC form of the pattern and use its size as a shift guide. The NFC
\r
1019 * form should be the smallest possible representation of the pattern. Anyways,
\r
1020 * we'll err on the smaller shift size. Hence the calculation for
\r
1021 * minlength. Canonical match will be performed slightly differently. We'll
\r
1022 * split the pattern into 3 parts, the prefix accents (PA), the middle
\r
1023 * string bounded by the first and last base character (MS), the ending
\r
1024 * accents (EA). Matches will be done on MS first, and only when we match
\r
1025 * MS then some processing will be required for the prefix and end accents
\r
1026 * in order to determine if they match PA and EA. Hence the default shift
\r
1027 * values for the canonical match will take the size of either end's accent
\r
1028 * into consideration. Forwards search will take the end accents into
\r
1029 * consideration for the default shift values and the backwards search will
\r
1030 * take the prefix accents into consideration.</p>
\r
1031 * <p>If pattern has no non-ignorable ce, we return an illegal argument
\r
1034 private final void initialize()
\r
1036 int expandlength = initializePattern();
\r
1037 if (m_pattern_.m_CELength_ > 0) {
\r
1038 char minlength = (char)(m_pattern_.m_CELength_ > expandlength
\r
1039 ? m_pattern_.m_CELength_ - expandlength : 1);
\r
1040 m_pattern_.m_defaultShiftSize_ = minlength;
\r
1041 setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_,
\r
1042 m_pattern_.m_CE_, m_pattern_.m_CELength_,
\r
1043 expandlength, minlength, minlength);
\r
1046 m_pattern_.m_defaultShiftSize_ = 0;
\r
1051 * Determine whether the search text bounded by the offset start and end is
\r
1052 * one or more whole units of text as determined by the breakiterator in
\r
1054 * @param start target text start offset
\r
1055 * @param end target text end offset
\r
1057 private final boolean isBreakUnit(int start, int end)
\r
1059 if (breakIterator != null) {
\r
1060 int startindex = breakIterator.first();
\r
1061 int endindex = breakIterator.last();
\r
1063 // out-of-range indexes are never boundary positions
\r
1064 if (start < startindex || start > endindex || end < startindex
\r
1065 || end > endindex) {
\r
1068 // otherwise, we can use following() on the position before the
\r
1069 // specified one and return true if the position we get back is the
\r
1070 // one the user specified
\r
1071 boolean result = (start == startindex
\r
1072 || breakIterator.following(start - 1) == start)
\r
1073 && (end == endindex
\r
1074 || breakIterator.following(end - 1) == end);
\r
1076 // iterates the individual ces
\r
1077 m_utilColEIter_.setText(
\r
1078 new CharacterIteratorWrapper(targetText), start);
\r
1079 for (int count = 0; count < m_pattern_.m_CELength_;
\r
1081 int ce = getCE(m_utilColEIter_.next());
\r
1082 if (ce == CollationElementIterator.IGNORABLE) {
\r
1086 if (ce != m_pattern_.m_CE_[count]) {
\r
1090 int nextce = m_utilColEIter_.next();
\r
1091 while (m_utilColEIter_.getOffset() == end
\r
1092 && getCE(nextce) == CollationElementIterator.IGNORABLE) {
\r
1093 nextce = m_utilColEIter_.next();
\r
1095 if (nextce != CollationElementIterator.NULLORDER
\r
1096 && m_utilColEIter_.getOffset() == end) {
\r
1097 // extra collation elements at the end of the match
\r
1107 * Getting the next base character offset if current offset is an accent,
\r
1108 * or the current offset if the current character contains a base character.
\r
1109 * For accents, the following base character will be returned
\r
1110 * @param text string
\r
1111 * @param textoffset current offset
\r
1112 * @param textlength length of text string
\r
1113 * @return the next base character or the current offset
\r
1114 * if the current character contains a base character.
\r
1116 private final int getNextBaseOffset(CharacterIterator text,
\r
1119 if (textoffset < text.getEndIndex()) {
\r
1120 while (text.getIndex() < text.getEndIndex()) {
\r
1121 int result = textoffset;
\r
1122 if ((getFCD(text, textoffset ++)
\r
1123 >> SECOND_LAST_BYTE_SHIFT_) == 0) {
\r
1127 return text.getEndIndex();
\r
1129 return textoffset;
\r
1133 * Gets the next base character offset depending on the string search
\r
1135 * @param textoffset one offset away from the last character
\r
1137 * @return start index of the next base character or the current offset
\r
1138 * if the current character contains a base character.
\r
1140 private final int getNextBaseOffset(int textoffset)
\r
1142 if (m_pattern_.m_hasSuffixAccents_
\r
1143 && textoffset < m_textLimitOffset_) {
\r
1144 targetText.setIndex(textoffset);
\r
1145 targetText.previous();
\r
1146 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
\r
1147 return getNextBaseOffset(targetText, textoffset);
\r
1150 return textoffset;
\r
1154 * Shifting the collation element iterator position forward to prepare for
\r
1155 * a following match. If the last character is an unsafe character, we'll
\r
1156 * only shift by 1 to capture contractions, normalization etc.
\r
1157 * Internal method, status assumed to be success.
\r
1158 * @param textoffset start text position to do search
\r
1159 * @param ce the text ce which failed the match.
\r
1160 * @param patternceindex index of the ce within the pattern ce buffer which
\r
1161 * failed the match
\r
1162 * @return final offset
\r
1164 private int shiftForward(int textoffset, int ce, int patternceindex)
\r
1167 if (ce != CollationElementIterator.NULLORDER) {
\r
1168 int shift = m_pattern_.m_shift_[hash(ce)];
\r
1169 // this is to adjust for characters in the middle of the
\r
1170 // substring for matching that failed.
\r
1171 int adjust = m_pattern_.m_CELength_ - patternceindex;
\r
1172 if (adjust > 1 && shift >= adjust) {
\r
1173 shift -= adjust - 1;
\r
1175 textoffset += shift;
\r
1178 textoffset += m_pattern_.m_defaultShiftSize_;
\r
1181 textoffset = getNextBaseOffset(textoffset);
\r
1182 // check for unsafe characters
\r
1183 // * if it is the start or middle of a contraction: to be done after
\r
1184 // a initial match is found
\r
1185 // * thai or lao base consonant character: similar to contraction
\r
1186 // * high surrogate character: similar to contraction
\r
1187 // * next character is a accent: shift to the next base character
\r
1188 return textoffset;
\r
1192 * Gets the offset to the next safe point in text.
\r
1193 * ie. not the middle of a contraction, swappable characters or
\r
1194 * supplementary characters.
\r
1195 * @param textoffset offset in string
\r
1196 * @param end offset in string
\r
1197 * @return offset to the next safe character
\r
1199 private final int getNextSafeOffset(int textoffset, int end)
\r
1201 int result = textoffset; // first contraction character
\r
1202 targetText.setIndex(result);
\r
1203 while (result != end &&
\r
1204 m_collator_.isUnsafe(targetText.current())) {
\r
1206 targetText.setIndex(result);
\r
1212 * This checks for accents in the potential match started with a composite
\r
1214 * This is really painful... we have to check that composite character do
\r
1215 * not have any extra accents. We have to normalize the potential match and
\r
1216 * find the immediate decomposed character before the match.
\r
1217 * The first composite character would have been taken care of by the fcd
\r
1218 * checks in checkForwardExactMatch.
\r
1219 * This is the slow path after the fcd of the first character and
\r
1220 * the last character has been checked by checkForwardExactMatch and we
\r
1221 * determine that the potential match has extra non-ignorable preceding
\r
1223 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
\r
1224 * checkExtraMatchAccent should fail since there is a middle ring in
\r
1225 * \u01FA Note here that accents checking are slow and cautioned in the API
\r
1227 * Internal method, status assumed to be a success, caller should check
\r
1228 * status before calling this method
\r
1229 * @param start index of the potential unfriendly composite character
\r
1230 * @param end index of the potential unfriendly composite character
\r
1231 * @return true if there are non-ignorable accents at the beginning
\r
1232 * of the match, false otherwise.
\r
1234 private final boolean checkExtraMatchAccents(int start, int end)
\r
1236 boolean result = false;
\r
1237 if (m_pattern_.m_hasPrefixAccents_) {
\r
1238 targetText.setIndex(start);
\r
1240 if (UTF16.isLeadSurrogate(targetText.next())) {
\r
1241 if (!UTF16.isTrailSurrogate(targetText.next())) {
\r
1242 targetText.previous();
\r
1245 // we are only concerned with the first composite character
\r
1246 String str = getString(targetText, start, end);
\r
1247 if (Normalizer.quickCheck(str, Normalizer.NFD,0)
\r
1248 == Normalizer.NO) {
\r
1249 int safeoffset = getNextSafeOffset(start, end);
\r
1250 if (safeoffset != end) {
\r
1253 String decomp = Normalizer.decompose(
\r
1254 str.substring(0, safeoffset - start), false);
\r
1255 m_utilColEIter_.setText(decomp);
\r
1256 int firstce = m_pattern_.m_CE_[0];
\r
1257 boolean ignorable = true;
\r
1258 int ce = CollationElementIterator.IGNORABLE;
\r
1260 while (ce != firstce) {
\r
1261 offset = m_utilColEIter_.getOffset();
\r
1262 if (ce != firstce
\r
1263 && ce != CollationElementIterator.IGNORABLE) {
\r
1264 ignorable = false;
\r
1266 ce = m_utilColEIter_.next();
\r
1268 m_utilColEIter_.setExactOffset(offset); // back up 1 to the
\r
1269 m_utilColEIter_.previous(); // right offset
\r
1270 offset = m_utilColEIter_.getOffset();
\r
1271 result = !ignorable && (UCharacter.getCombiningClass(
\r
1272 UTF16.charAt(decomp, offset)) != 0);
\r
1280 * Used by exact matches, checks if there are accents before the match.
\r
1281 * This is really painful... we have to check that composite characters at
\r
1282 * the start of the matches have to not have any extra accents.
\r
1283 * We check the FCD of the character first, if it starts with an accent and
\r
1284 * the first pattern ce does not match the first ce of the character, we
\r
1286 * Otherwise we try normalizing the first composite
\r
1287 * character and find the immediate decomposed character before the match to
\r
1288 * see if it is an non-ignorable accent.
\r
1289 * Now normalizing the first composite character is enough because we ensure
\r
1290 * that when the match is passed in here with extra beginning ces, the
\r
1291 * first or last ce that match has to occur within the first character.
\r
1292 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
\r
1293 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
\r
1294 * Note here that accents checking are slow and cautioned in the API docs.
\r
1295 * @param start offset
\r
1296 * @param end offset
\r
1297 * @return true if there are accents on either side of the match,
\r
1300 private final boolean hasAccentsBeforeMatch(int start, int end)
\r
1302 if (m_pattern_.m_hasPrefixAccents_) {
\r
1303 // we have been iterating forwards previously
\r
1304 boolean ignorable = true;
\r
1305 int firstce = m_pattern_.m_CE_[0];
\r
1306 m_colEIter_.setExactOffset(start);
\r
1307 int ce = getCE(m_colEIter_.next());
\r
1308 while (ce != firstce) {
\r
1309 if (ce != CollationElementIterator.IGNORABLE) {
\r
1310 ignorable = false;
\r
1312 ce = getCE(m_colEIter_.next());
\r
1314 if (!ignorable && m_colEIter_.isInBuffer()) {
\r
1315 // within normalization buffer, discontiguous handled here
\r
1320 boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_)
\r
1323 return checkExtraMatchAccents(start, end);
\r
1328 if (start > m_textBeginOffset_) {
\r
1329 targetText.setIndex(start);
\r
1330 targetText.previous();
\r
1331 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_)
\r
1333 m_colEIter_.setExactOffset(start);
\r
1334 ce = m_colEIter_.previous();
\r
1335 if (ce != CollationElementIterator.NULLORDER
\r
1336 && ce != CollationElementIterator.IGNORABLE) {
\r
1347 * Used by exact matches, checks if there are accents bounding the match.
\r
1348 * Note this is the initial boundary check. If the potential match
\r
1349 * starts or ends with composite characters, the accents in those
\r
1350 * characters will be determined later.
\r
1351 * Not doing backwards iteration here, since discontiguous contraction for
\r
1352 * backwards collation element iterator, uses up too many characters.
\r
1353 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
\r
1354 * should fail since there is an acute at the end of \u01FA
\r
1355 * Note here that accents checking are slow and cautioned in the API docs.
\r
1356 * @param start offset of match
\r
1357 * @param end end offset of the match
\r
1358 * @return true if there are accents on either side of the match,
\r
1361 private final boolean hasAccentsAfterMatch(int start, int end)
\r
1363 if (m_pattern_.m_hasSuffixAccents_) {
\r
1364 targetText.setIndex(end);
\r
1365 if (end > m_textBeginOffset_
\r
1366 && UTF16.isTrailSurrogate(targetText.previous())) {
\r
1367 if (targetText.getIndex() > m_textBeginOffset_ &&
\r
1368 !UTF16.isLeadSurrogate(targetText.previous())) {
\r
1369 targetText.next();
\r
1372 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
\r
1373 int firstce = m_pattern_.m_CE_[0];
\r
1374 m_colEIter_.setExactOffset(start);
\r
1375 while (getCE(m_colEIter_.next()) != firstce) {
\r
1378 while (count < m_pattern_.m_CELength_) {
\r
1379 if (getCE(m_colEIter_.next())
\r
1380 == CollationElementIterator.IGNORABLE) {
\r
1385 //int ce = getCE(m_colEIter_.next());
\r
1386 int ce = m_colEIter_.next();
\r
1387 if (ce != CollationElementIterator.NULLORDER
\r
1388 && ce != CollationElementIterator.IGNORABLE) {
\r
1391 if (ce != CollationElementIterator.NULLORDER
\r
1392 && ce != CollationElementIterator.IGNORABLE) {
\r
1393 if (m_colEIter_.getOffset() <= end) {
\r
1396 if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_)
\r
1407 * Checks if the offset runs out of the text string range
\r
1408 * @param textstart offset of the first character in the range
\r
1409 * @param textlimit limit offset of the text string range
\r
1410 * @param offset to test
\r
1411 * @return true if offset is out of bounds, false otherwise
\r
1413 private static final boolean isOutOfBounds(int textstart, int textlimit,
\r
1416 return offset < textstart || offset > textlimit;
\r
1420 * Checks for identical match
\r
1421 * @param strsrch string search data
\r
1422 * @param start offset of possible match
\r
1423 * @param end offset of possible match
\r
1424 * @return true if identical match is found
\r
1426 private final boolean checkIdentical(int start, int end)
\r
1428 if (m_collator_.getStrength() != Collator.IDENTICAL) {
\r
1432 String textstr = getString(targetText, start, end - start);
\r
1433 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
\r
1434 == Normalizer.NO) {
\r
1435 textstr = Normalizer.decompose(textstr, false);
\r
1437 String patternstr = m_pattern_.targetText;
\r
1438 if (Normalizer.quickCheck(patternstr, Normalizer.NFD,0)
\r
1439 == Normalizer.NO) {
\r
1440 patternstr = Normalizer.decompose(patternstr, false);
\r
1442 return textstr.equals(patternstr);
\r
1446 * Checks to see if the match is repeated
\r
1447 * @param start new match start index
\r
1448 * @param limit new match limit index
\r
1449 * @return true if the match is repeated, false otherwise
\r
1451 private final boolean checkRepeatedMatch(int start, int limit)
\r
1453 if (m_matchedIndex_ == DONE) {
\r
1456 int end = limit - 1; // last character in the match
\r
1457 int lastmatchend = m_matchedIndex_ + matchLength - 1;
\r
1458 if (!isOverlapping()) {
\r
1459 return (start >= m_matchedIndex_ && start <= lastmatchend)
\r
1460 || (end >= m_matchedIndex_ && end <= lastmatchend)
\r
1461 || (start <= m_matchedIndex_ && end >= lastmatchend);
\r
1464 return start <= m_matchedIndex_ && end >= lastmatchend;
\r
1468 * Checks match for contraction.
\r
1469 * If the match ends with a partial contraction we fail.
\r
1470 * If the match starts too far off (because of backwards iteration) we try
\r
1471 * to chip off the extra characters depending on whether a breakiterator
\r
1473 * Temporary utility buffer used to return modified start and end.
\r
1474 * @param start offset of potential match, to be modified if necessary
\r
1475 * @param end offset of potential match, to be modified if necessary
\r
1476 * @return true if match passes the contraction test, false otherwise.
\r
1478 private final boolean checkNextExactContractionMatch(int start, int end)
\r
1480 // This part checks if either ends of the match contains potential
\r
1481 // contraction. If so we'll have to iterate through them
\r
1483 if (end < m_textLimitOffset_) {
\r
1484 targetText.setIndex(end);
\r
1485 endchar = targetText.current();
\r
1487 char poststartchar = 0;
\r
1488 if (start + 1 < m_textLimitOffset_) {
\r
1489 targetText.setIndex(start + 1);
\r
1490 poststartchar = targetText.current();
\r
1492 if (m_collator_.isUnsafe(endchar)
\r
1493 || m_collator_.isUnsafe(poststartchar)) {
\r
1494 // expansion prefix, what's left to iterate
\r
1495 int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
\r
1496 boolean hasBufferedCE = bufferedCEOffset > 0;
\r
1497 m_colEIter_.setExactOffset(start);
\r
1499 while (bufferedCEOffset > 0) {
\r
1500 // getting rid of the redundant ce, caused by setOffset.
\r
1501 // since backward contraction/expansion may have extra ces if
\r
1502 // we are in the normalization buffer, hasAccentsBeforeMatch
\r
1503 // would have taken care of it.
\r
1504 // E.g. the character \u01FA will have an expansion of 3, but
\r
1505 // if we are only looking for acute and ring \u030A and \u0301,
\r
1506 // we'll have to skip the first ce in the expansion buffer.
\r
1507 m_colEIter_.next();
\r
1508 if (m_colEIter_.getOffset() != temp) {
\r
1510 temp = m_colEIter_.getOffset();
\r
1512 bufferedCEOffset --;
\r
1516 while (count < m_pattern_.m_CELength_) {
\r
1517 int ce = getCE(m_colEIter_.next());
\r
1518 if (ce == CollationElementIterator.IGNORABLE) {
\r
1521 if (hasBufferedCE && count == 0
\r
1522 && m_colEIter_.getOffset() != temp) {
\r
1524 temp = m_colEIter_.getOffset();
\r
1526 if (ce != m_pattern_.m_CE_[count]) {
\r
1528 end = getNextBaseOffset(end);
\r
1529 m_utilBuffer_[0] = start;
\r
1530 m_utilBuffer_[1] = end;
\r
1536 m_utilBuffer_[0] = start;
\r
1537 m_utilBuffer_[1] = end;
\r
1543 * Checks and sets the match information if found.
\r
1546 * <li> the potential match does not repeat the previous match
\r
1547 * <li> boundaries are correct
\r
1548 * <li> exact matches has no extra accents
\r
1549 * <li> identical matches
\r
1550 * <li> potential match does not end in the middle of a contraction
\r
1552 * Otherwise the offset will be shifted to the next character.
\r
1553 * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
\r
1554 * more fitting result value.
\r
1555 * Uses the temporary utility buffer for storing the modified textoffset.
\r
1556 * @param textoffset offset in the collation element text.
\r
1557 * @return true if the match is valid, false otherwise
\r
1559 private final boolean checkNextExactMatch(int textoffset)
\r
1561 int start = m_colEIter_.getOffset();
\r
1562 if (!checkNextExactContractionMatch(start, textoffset)) {
\r
1563 // returns the modified textoffset
\r
1564 m_utilBuffer_[0] = m_utilBuffer_[1];
\r
1568 start = m_utilBuffer_[0];
\r
1569 textoffset = m_utilBuffer_[1];
\r
1570 // this totally matches, however we need to check if it is repeating
\r
1571 if (!isBreakUnit(start, textoffset)
\r
1572 || checkRepeatedMatch(start, textoffset)
\r
1573 || hasAccentsBeforeMatch(start, textoffset)
\r
1574 || !checkIdentical(start, textoffset)
\r
1575 || hasAccentsAfterMatch(start, textoffset)) {
\r
1577 textoffset = getNextBaseOffset(textoffset);
\r
1578 m_utilBuffer_[0] = textoffset;
\r
1582 if (m_collator_.getStrength() == Collator.PRIMARY) {
\r
1583 textoffset = checkBreakBoundary(textoffset);
\r
1586 // totally match, we will get rid of the ending ignorables.
\r
1587 m_matchedIndex_ = start;
\r
1588 matchLength = textoffset - start;
\r
1593 * Getting the previous base character offset, or the current offset if the
\r
1594 * current character is a base character
\r
1595 * @param text the source text to work on
\r
1596 * @param textoffset one offset after the current character
\r
1597 * @return the offset of the next character after the base character or the
\r
1598 * first composed character with accents
\r
1600 private final int getPreviousBaseOffset(CharacterIterator text,
\r
1603 if (textoffset > m_textBeginOffset_) {
\r
1605 int result = textoffset;
\r
1606 text.setIndex(result);
\r
1607 if (UTF16.isTrailSurrogate(text.previous())) {
\r
1608 if (text.getIndex() != text.getBeginIndex() &&
\r
1609 !UTF16.isLeadSurrogate(text.previous())) {
\r
1613 textoffset = text.getIndex();
\r
1614 char fcd = getFCD(text, textoffset);
\r
1615 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
\r
1616 if ((fcd & LAST_BYTE_MASK_) != 0) {
\r
1617 return textoffset;
\r
1621 if (textoffset == m_textBeginOffset_) {
\r
1622 return m_textBeginOffset_;
\r
1626 return textoffset;
\r
1630 * Getting the indexes of the accents that are not blocked in the argument
\r
1632 * @param accents accents in nfd.
\r
1633 * @param accentsindex array to store the indexes of accents in accents that
\r
1635 * @return the length of populated accentsindex
\r
1637 private int getUnblockedAccentIndex(StringBuffer accents,
\r
1638 int accentsindex[])
\r
1641 int length = accents.length();
\r
1644 while (index < length) {
\r
1645 int codepoint = UTF16.charAt(accents, index);
\r
1646 int tempclass = UCharacter.getCombiningClass(codepoint);
\r
1647 if (tempclass != cclass) {
\r
1648 cclass = tempclass;
\r
1649 accentsindex[result] = index;
\r
1652 if (UCharacter.isSupplementary(codepoint)) {
\r
1659 accentsindex[result] = length;
\r
1664 * Appends 3 StringBuffer/CharacterIterator together into a destination
\r
1666 * @param source1 string buffer
\r
1667 * @param source2 character iterator
\r
1668 * @param start2 start of the character iterator to merge
\r
1669 * @param end2 end of the character iterator to merge
\r
1670 * @param source3 string buffer
\r
1671 * @return appended string buffer
\r
1673 private static final StringBuffer merge(StringBuffer source1,
\r
1674 CharacterIterator source2,
\r
1675 int start2, int end2,
\r
1676 StringBuffer source3)
\r
1678 StringBuffer result = new StringBuffer();
\r
1679 if (source1 != null && source1.length() != 0) {
\r
1680 // jdk 1.3.1 does not have append(StringBuffer) yet
\r
1681 if(com.ibm.icu.impl.ICUDebug.isJDK14OrHigher){
\r
1682 result.append(source1);
\r
1684 result.append(source1.toString());
\r
1687 source2.setIndex(start2);
\r
1688 while (source2.getIndex() < end2) {
\r
1689 result.append(source2.current());
\r
1692 if (source3 != null && source3.length() != 0) {
\r
1693 // jdk 1.3.1 does not have append(StringBuffer) yet
\r
1694 if(com.ibm.icu.impl.ICUDebug.isJDK14OrHigher){
\r
1695 result.append(source3);
\r
1697 result.append(source3.toString());
\r
1704 * Running through a collation element iterator to see if the contents
\r
1705 * matches pattern in string search data
\r
1706 * @param coleiter collation element iterator to test
\r
1707 * @return true if a match if found, false otherwise
\r
1709 private final boolean checkCollationMatch(CollationElementIterator coleiter)
\r
1711 int patternceindex = m_pattern_.m_CELength_;
\r
1713 while (patternceindex > 0) {
\r
1714 int ce = getCE(coleiter.next());
\r
1715 if (ce == CollationElementIterator.IGNORABLE) {
\r
1718 if (ce != m_pattern_.m_CE_[offset]) {
\r
1722 patternceindex --;
\r
1728 * Rearranges the front accents to try matching.
\r
1729 * Prefix accents in the text will be grouped according to their combining
\r
1730 * class and the groups will be mixed and matched to try find the perfect
\r
1731 * match with the pattern.
\r
1732 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
1733 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
1734 * substrings "\u030A", "\u0301", "\u0325", "\u030A\u0301",
\r
1735 * "\u030A\u0325", "\u0301\u0325".
\r
1736 * step 2: check if any of the generated substrings matches the pattern.
\r
1737 * Internal method, status is assumed to be success, caller has to check
\r
1738 * status before calling this method.
\r
1739 * @param start first offset of the accents to start searching
\r
1740 * @param end start of the last accent set
\r
1741 * @return DONE if a match is not found, otherwise return the starting
\r
1742 * offset of the match. Note this start includes all preceding
\r
1745 private int doNextCanonicalPrefixMatch(int start, int end)
\r
1747 if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
\r
1748 // die... failed at a base character
\r
1752 start = targetText.getIndex(); // index changed by fcd
\r
1753 int offset = getNextBaseOffset(targetText, start);
\r
1754 start = getPreviousBaseOffset(start);
\r
1756 StringBuffer accents = new StringBuffer();
\r
1757 String accentstr = getString(targetText, start, offset - start);
\r
1758 // normalizing the offensive string
\r
1759 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
1760 == Normalizer.NO) {
\r
1761 accentstr = Normalizer.decompose(accentstr, false);
\r
1763 accents.append(accentstr);
\r
1765 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
1766 int accentsize = getUnblockedAccentIndex(accents, accentsindex);
\r
1767 int count = (2 << (accentsize - 1)) - 1;
\r
1768 while (count > 0) {
\r
1769 // copy the base characters
\r
1770 m_canonicalPrefixAccents_.delete(0,
\r
1771 m_canonicalPrefixAccents_.length());
\r
1773 for (; k < accentsindex[0]; k ++) {
\r
1774 m_canonicalPrefixAccents_.append(accents.charAt(k));
\r
1776 // forming all possible canonical rearrangement by dropping
\r
1777 // sets of accents
\r
1778 for (int i = 0; i <= accentsize - 1; i ++) {
\r
1779 int mask = 1 << (accentsize - i - 1);
\r
1780 if ((count & mask) != 0) {
\r
1781 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
1783 m_canonicalPrefixAccents_.append(accents.charAt(j));
\r
1787 StringBuffer match = merge(m_canonicalPrefixAccents_,
\r
1788 targetText, offset, end,
\r
1789 m_canonicalSuffixAccents_);
\r
1791 // if status is a failure, ucol_setText does nothing.
\r
1792 // run the collator iterator through this match
\r
1793 m_utilColEIter_.setText(match.toString());
\r
1794 if (checkCollationMatch(m_utilColEIter_)) {
\r
1803 * Gets the offset to the safe point in text before textoffset.
\r
1804 * ie. not the middle of a contraction, swappable characters or
\r
1805 * supplementary characters.
\r
1806 * @param start offset in string
\r
1807 * @param textoffset offset in string
\r
1808 * @return offset to the previous safe character
\r
1810 private final int getPreviousSafeOffset(int start, int textoffset)
\r
1812 int result = textoffset; // first contraction character
\r
1813 targetText.setIndex(textoffset);
\r
1814 while (result >= start && m_collator_.isUnsafe(targetText.previous())) {
\r
1815 result = targetText.getIndex();
\r
1817 if (result != start) {
\r
1818 // the first contraction character is consider unsafe here
\r
1819 result = targetText.getIndex(); // originally result --;
\r
1825 * Take the rearranged end accents and try matching. If the match failed at
\r
1826 * a separate preceding set of accents (separated from the rearranged one by
\r
1827 * at least a base character) then we rearrange the preceding accents and
\r
1828 * try matching again.
\r
1829 * We allow skipping of the ends of the accent set if the ces do not match.
\r
1830 * However if the failure is found before the accent set, it fails.
\r
1831 * Internal method, status assumed to be success, caller has to check
\r
1832 * status before calling this method.
\r
1833 * @param textoffset of the start of the rearranged accent
\r
1834 * @return DONE if a match is not found, otherwise return the starting
\r
1835 * offset of the match. Note this start includes all preceding
\r
1838 private int doNextCanonicalSuffixMatch(int textoffset)
\r
1840 int safelength = 0;
\r
1841 StringBuffer safetext;
\r
1842 int safeoffset = m_textBeginOffset_;
\r
1844 if (textoffset != m_textBeginOffset_
\r
1845 && m_canonicalSuffixAccents_.length() > 0
\r
1846 && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) {
\r
1847 safeoffset = getPreviousSafeOffset(m_textBeginOffset_,
\r
1849 safelength = textoffset - safeoffset;
\r
1850 safetext = merge(null, targetText, safeoffset, textoffset,
\r
1851 m_canonicalSuffixAccents_);
\r
1854 safetext = m_canonicalSuffixAccents_;
\r
1857 // if status is a failure, ucol_setText does nothing
\r
1858 CollationElementIterator coleiter = m_utilColEIter_;
\r
1859 coleiter.setText(safetext.toString());
\r
1860 // status checked in loop below
\r
1862 int ceindex = m_pattern_.m_CELength_ - 1;
\r
1863 boolean isSafe = true; // indication flag for position in safe zone
\r
1865 while (ceindex >= 0) {
\r
1866 int textce = coleiter.previous();
\r
1867 if (textce == CollationElementIterator.NULLORDER) {
\r
1868 // check if we have passed the safe buffer
\r
1869 if (coleiter == m_colEIter_) {
\r
1872 coleiter = m_colEIter_;
\r
1873 if (safetext != m_canonicalSuffixAccents_) {
\r
1874 safetext.delete(0, safetext.length());
\r
1876 coleiter.setExactOffset(safeoffset);
\r
1877 // status checked at the start of the loop
\r
1881 textce = getCE(textce);
\r
1882 if (textce != CollationElementIterator.IGNORABLE
\r
1883 && textce != m_pattern_.m_CE_[ceindex]) {
\r
1884 // do the beginning stuff
\r
1885 int failedoffset = coleiter.getOffset();
\r
1886 if (isSafe && failedoffset >= safelength) {
\r
1887 // alas... no hope. failed at rearranged accent set
\r
1892 failedoffset += safeoffset;
\r
1895 // try rearranging the front accents
\r
1896 int result = doNextCanonicalPrefixMatch(failedoffset,
\r
1898 if (result != DONE) {
\r
1899 // if status is a failure, ucol_setOffset does nothing
\r
1900 m_colEIter_.setExactOffset(result);
\r
1905 if (textce == m_pattern_.m_CE_[ceindex]) {
\r
1909 // set offset here
\r
1911 int result = coleiter.getOffset();
\r
1912 // sets the text iterator with the correct expansion and offset
\r
1913 int leftoverces = coleiter.m_CEBufferOffset_;
\r
1914 if (result >= safelength) {
\r
1915 result = textoffset;
\r
1918 result += safeoffset;
\r
1920 m_colEIter_.setExactOffset(result);
\r
1921 m_colEIter_.m_CEBufferOffset_ = leftoverces;
\r
1925 return coleiter.getOffset();
\r
1929 * Trying out the substring and sees if it can be a canonical match.
\r
1930 * This will try normalizing the end accents and arranging them into
\r
1931 * canonical equivalents and check their corresponding ces with the pattern
\r
1933 * Suffix accents in the text will be grouped according to their combining
\r
1934 * class and the groups will be mixed and matched to try to find the perfect
\r
1935 * match with the pattern.
\r
1936 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
1937 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
1939 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
1941 * step 2: check if any of the generated substrings matches the pattern.
\r
1942 * @param textoffset end offset in the collation element text that ends with
\r
1943 * the accents to be rearranged
\r
1944 * @return true if the match is valid, false otherwise
\r
// NOTE(review): lossy extraction -- original line numbers are embedded in each
// line and several lines are missing (e.g. 1947, 1954-1955, 2010-2016); restore
// from the canonical ICU4J StringSearch source before compiling.
// Tries a forward canonical match ending at textoffset: first a prefix-accent
// rearrangement if the preceding character carries no trailing FCD accents,
// otherwise decomposes the trailing accents and tries every unblocked-accent
// subset as a rearranged suffix via doNextCanonicalSuffixMatch.
1946 private boolean doNextCanonicalMatch(int textoffset)
\r
1948 int offset = m_colEIter_.getOffset();
\r
1949 targetText.setIndex(textoffset);
\r
// back up over a surrogate pair so FCD is checked on the full code point
1950 if (UTF16.isTrailSurrogate(targetText.previous())
\r
1951 && targetText.getIndex() > m_textBeginOffset_) {
\r
1952 if (!UTF16.isLeadSurrogate(targetText.previous())) {
\r
1953 targetText.next();
\r
// trailing-combining-class byte of the FCD value == 0 means no suffix accents
1956 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
\r
1957 if (m_pattern_.m_hasPrefixAccents_) {
\r
1958 offset = doNextCanonicalPrefixMatch(offset, textoffset);
\r
1959 if (offset != DONE) {
\r
1960 m_colEIter_.setExactOffset(offset);
\r
1967 if (!m_pattern_.m_hasSuffixAccents_) {
\r
1971 StringBuffer accents = new StringBuffer();
\r
1972 // offset to the last base character in substring to search
\r
1973 int baseoffset = getPreviousBaseOffset(targetText, textoffset);
\r
1974 // normalizing the offensive string
\r
1975 String accentstr = getString(targetText, baseoffset,
\r
1976 textoffset - baseoffset);
\r
1977 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
1978 == Normalizer.NO) {
\r
1979 accentstr = Normalizer.decompose(accentstr, false);
\r
1981 accents.append(accentstr);
\r
1982 // status checked in loop below
\r
1984 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
1985 int size = getUnblockedAccentIndex(accents, accentsindex);
\r
1987 // 2 power n - 1 plus the full set of accents
\r
1988 int count = (2 << (size - 1)) - 1;
\r
// each count value selects one non-empty subset of the unblocked accent groups
1989 while (count > 0) {
\r
1990 m_canonicalSuffixAccents_.delete(0,
\r
1991 m_canonicalSuffixAccents_.length());
\r
1992 // copy the base characters
\r
1993 for (int k = 0; k < accentsindex[0]; k ++) {
\r
1994 m_canonicalSuffixAccents_.append(accents.charAt(k));
\r
1996 // forming all possible canonical rearrangement by dropping
\r
1997 // sets of accents
\r
1998 for (int i = 0; i <= size - 1; i ++) {
\r
1999 int mask = 1 << (size - i - 1);
\r
2000 if ((count & mask) != 0) {
\r
2001 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2003 m_canonicalSuffixAccents_.append(accents.charAt(j));
\r
2007 offset = doNextCanonicalSuffixMatch(baseoffset);
\r
2008 if (offset != DONE) {
\r
2009 return true; // match found
\r
2017 * Gets the previous base character offset depending on the string search
\r
2019 * @param strsrch string search data
\r
2020 * @param textoffset current offset, current character
\r
2021 * @return the offset of the next character after this base character or
\r
2022 * itself if it is a composed character with accents
\r
// NOTE(review): lossy extraction -- embedded line numbers, missing closing
// lines (e.g. 2025, 2030-2031); verify against canonical ICU4J source.
// Returns the offset of the preceding base character when the pattern has
// prefix accents and the character at textoffset has a non-zero leading
// combining class (upper FCD byte); otherwise returns textoffset unchanged.
2024 private final int getPreviousBaseOffset(int textoffset)
\r
2026 if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) {
\r
2027 int offset = textoffset;
\r
// leading-combining-class byte != 0 means we are sitting on an accent
2028 if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
\r
2029 return getPreviousBaseOffset(targetText, textoffset);
\r
2032 return textoffset;
\r
2036 * Checks match for contraction.
\r
2037 * If the match ends with a partial contraction we fail.
\r
2038 * If the match starts too far off (because of backwards iteration) we try
\r
2039 * to chip off the extra characters.
\r
2040 * Uses the temporary util buffer for return values of the modified start
\r
2042 * @param start offset of potential match, to be modified if necessary
\r
2043 * @param end offset of potential match, to be modified if necessary
\r
2044 * @return true if match passes the contraction test, false otherwise.
\r
// NOTE(review): lossy extraction -- embedded line numbers; declarations of
// echar/schar/temp/count and several braces are among the missing lines;
// restore from canonical ICU4J source before compiling.
// Walks the collation elements of [start, end) again when either boundary
// character is contraction-unsafe, to confirm the match is not a partial
// contraction; writes the (possibly adjusted) boundaries to m_utilBuffer_.
2046 private boolean checkNextCanonicalContractionMatch(int start, int end)
\r
2048 // This part checks if either ends of the match contains potential
\r
2049 // contraction. If so we'll have to iterate through them
\r
2052 if (end < m_textLimitOffset_) {
\r
2053 targetText.setIndex(end);
\r
2054 echar = targetText.current();
\r
2056 if (start < m_textLimitOffset_) {
\r
2057 targetText.setIndex(start + 1);
\r
2058 schar = targetText.current();
\r
2060 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2061 int expansion = m_colEIter_.m_CEBufferOffset_;
\r
2062 boolean hasExpansion = expansion > 0;
\r
2063 m_colEIter_.setExactOffset(start);
\r
2065 while (expansion > 0) {
\r
2066 // getting rid of the redundant ce, caused by setOffset.
\r
2067 // since backward contraction/expansion may have extra ces if
\r
2068 // we are in the normalization buffer, hasAccentsBeforeMatch
\r
2069 // would have taken care of it.
\r
2070 // E.g. the character \u01FA will have an expansion of 3, but
\r
2071 // if we are only looking for acute and ring \u030A and \u0301,
\r
2072 // we'll have to skip the first ce in the expansion buffer.
\r
2073 m_colEIter_.next();
\r
2074 if (m_colEIter_.getOffset() != temp) {
\r
2076 temp = m_colEIter_.getOffset();
\r
// re-verify every pattern CE against the text CEs from this exact offset
2082 while (count < m_pattern_.m_CELength_) {
\r
2083 int ce = getCE(m_colEIter_.next());
\r
2084 // status checked below, note that if status is a failure
\r
2085 // ucol_next returns UCOL_NULLORDER
\r
2086 if (ce == CollationElementIterator.IGNORABLE) {
\r
2089 if (hasExpansion && count == 0
\r
2090 && m_colEIter_.getOffset() != temp) {
\r
2092 temp = m_colEIter_.getOffset();
\r
2095 if (count == 0 && ce != m_pattern_.m_CE_[0]) {
\r
2096 // accents may have extra starting ces, this occurs when a
\r
2097 // pure accent pattern is matched without rearrangement
\r
2098 // text \u0325\u0300 and looking for \u0300
\r
2099 int expected = m_pattern_.m_CE_[0];
\r
2100 if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
\r
// skip leading accent CEs until the first expected pattern CE appears
2101 ce = getCE(m_colEIter_.next());
\r
2102 while (ce != expected
\r
2103 && ce != CollationElementIterator.NULLORDER
\r
2104 && m_colEIter_.getOffset() <= end) {
\r
2105 ce = getCE(m_colEIter_.next());
\r
2109 if (ce != m_pattern_.m_CE_[count]) {
\r
// failure path: push end past the next base character and report back
2111 end = getNextBaseOffset(end);
\r
2112 m_utilBuffer_[0] = start;
\r
2113 m_utilBuffer_[1] = end;
\r
2119 m_utilBuffer_[0] = start;
\r
2120 m_utilBuffer_[1] = end;
\r
2125 * Checks and sets the match information if found.
\r
2128 * <li> the potential match does not repeat the previous match
\r
2129 * <li> boundaries are correct
\r
2130 * <li> potential match does not end in the middle of a contraction
\r
2131 * <li> identical matches
\r
2133 * Otherwise the offset will be shifted to the next character.
\r
2134 * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
\r
2135 * more fitting result value.
\r
2136 * Uses the temporary utility buffer for storing the modified textoffset.
\r
2137 * @param textoffset offset in the collation element text.
\r
2138 * @return true if the match is valid, false otherwise
\r
// NOTE(review): lossy extraction -- embedded line numbers, missing returns and
// braces (e.g. 2150-2152, 2157-2158, 2166, 2169-2171); restore from canonical
// ICU4J source before compiling.
// Validates a forward canonical match candidate ending at textoffset: accepts
// immediately when accent rearrangement already produced the match, otherwise
// runs the contraction / repeated-match / break-unit / identical checks and on
// success records m_matchedIndex_ and matchLength.
2140 private boolean checkNextCanonicalMatch(int textoffset)
\r
2142 // to ensure that the start and ends are not composite characters
\r
2143 // if we have a canonical accent match
\r
2144 if ((m_pattern_.m_hasSuffixAccents_
\r
2145 && m_canonicalSuffixAccents_.length() != 0) ||
\r
2146 (m_pattern_.m_hasPrefixAccents_
\r
2147 && m_canonicalPrefixAccents_.length() != 0)) {
\r
2148 m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset());
\r
2149 matchLength = textoffset - m_matchedIndex_;
\r
2153 int start = m_colEIter_.getOffset();
\r
2154 if (!checkNextCanonicalContractionMatch(start, textoffset)) {
\r
2155 // return the modified textoffset
\r
2156 m_utilBuffer_[0] = m_utilBuffer_[1];
\r
// contraction check passed: pick up the boundaries it may have adjusted
2159 start = m_utilBuffer_[0];
\r
2160 textoffset = m_utilBuffer_[1];
\r
2161 start = getPreviousBaseOffset(start);
\r
2162 // this totally matches, however we need to check if it is repeating
\r
2163 if (checkRepeatedMatch(start, textoffset)
\r
2164 || !isBreakUnit(start, textoffset)
\r
2165 || !checkIdentical(start, textoffset)) {
\r
2167 textoffset = getNextBaseOffset(targetText, textoffset);
\r
2168 m_utilBuffer_[0] = textoffset;
\r
2172 m_matchedIndex_ = start;
\r
2173 matchLength = textoffset - start;
\r
2178 * Shifting the collation element iterator position forward to prepare for
\r
2179 * a preceding match. If the first character is an unsafe character, we'll
\r
2180 * only shift by 1 to capture contractions, normalization etc.
\r
2181 * @param textoffset start text position to do search
\r
2182 * @param ce the text ce which failed the match.
\r
2183 * @param patternceindex index of the ce within the pattern ce buffer which
\r
2184 * failed the match
\r
2185 * @return final offset
\r
// NOTE(review): lossy extraction -- embedded line numbers, missing else/brace
// lines (e.g. 2191-2193, 2208-2209); restore from canonical ICU4J source.
// Computes the next (smaller) text offset to try for a backwards search:
// overlapping mode steps by the default shift, otherwise uses the bad-character
// back-shift table indexed by the failed CE, adjusted for how far into the
// pattern the mismatch occurred, then snaps to the previous base character.
2187 private int reverseShift(int textoffset, int ce, int patternceindex)
\r
2189 if (isOverlapping()) {
\r
2190 if (textoffset != m_textLimitOffset_) {
\r
2194 textoffset -= m_pattern_.m_defaultShiftSize_;
\r
2198 if (ce != CollationElementIterator.NULLORDER) {
\r
2199 int shift = m_pattern_.m_backShift_[hash(ce)];
\r
2201 // this is to adjust for characters in the middle of the substring
\r
2202 // for matching that failed.
\r
2203 int adjust = patternceindex;
\r
2204 if (adjust > 1 && shift > adjust) {
\r
2205 shift -= adjust - 1;
\r
2207 textoffset -= shift;
\r
2210 textoffset -= m_pattern_.m_defaultShiftSize_;
\r
// never land in the middle of a base + accent sequence
2214 textoffset = getPreviousBaseOffset(textoffset);
\r
2215 return textoffset;
\r
2219 * Checks match for contraction.
\r
2220 * If the match starts with a partial contraction we fail.
\r
2221 * Uses the temporary utility buffer to return the modified start and end.
\r
2222 * @param start offset of potential match, to be modified if necessary
\r
2223 * @param end offset of potential match, to be modified if necessary
\r
2224 * @return true if match passes the contraction test, false otherwise.
\r
// NOTE(review): lossy extraction -- embedded line numbers; echar/schar/temp
// declarations, return statements and braces are among the missing lines;
// restore from canonical ICU4J source before compiling.
// Backwards-search counterpart of checkNextCanonicalContractionMatch: when a
// boundary character is contraction-unsafe, re-walks the CEs of the candidate
// match from end to start against the pattern; adjusted boundaries are
// reported through m_utilBuffer_[0]/[1].
2226 private boolean checkPreviousExactContractionMatch(int start, int end)
\r
2228 // This part checks if either ends of the match contains potential
\r
2229 // contraction. If so we'll have to iterate through them
\r
2231 if (end < m_textLimitOffset_) {
\r
2232 targetText.setIndex(end);
\r
2233 echar = targetText.current();
\r
2236 if (start + 1 < m_textLimitOffset_) {
\r
2237 targetText.setIndex(start + 1);
\r
2238 schar = targetText.current();
\r
2240 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2241 // expansion suffix, what's left to iterate
\r
2242 int expansion = m_colEIter_.m_CEBufferSize_
\r
2243 - m_colEIter_.m_CEBufferOffset_;
\r
2244 boolean hasExpansion = expansion > 0;
\r
2245 m_colEIter_.setExactOffset(end);
\r
2247 while (expansion > 0) {
\r
2248 // getting rid of the redundant ce
\r
2249 // since forward contraction/expansion may have extra ces
\r
2250 // if we are in the normalization buffer, hasAccentsBeforeMatch
\r
2251 // would have taken care of it.
\r
2252 // E.g. the character \u01FA will have an expansion of 3, but if
\r
2253 // we are only looking for A ring A\u030A, we'll have to skip the
\r
2254 // last ce in the expansion buffer
\r
2255 m_colEIter_.previous();
\r
2256 if (m_colEIter_.getOffset() != temp) {
\r
2258 temp = m_colEIter_.getOffset();
\r
// compare pattern CEs from the last one backwards
2263 int count = m_pattern_.m_CELength_;
\r
2264 while (count > 0) {
\r
2265 int ce = getCE(m_colEIter_.previous());
\r
2266 // status checked below, note that if status is a failure
\r
2267 // ucol_previous returns UCOL_NULLORDER
\r
2268 if (ce == CollationElementIterator.IGNORABLE) {
\r
2271 if (hasExpansion && count == 0
\r
2272 && m_colEIter_.getOffset() != temp) {
\r
2274 temp = m_colEIter_.getOffset();
\r
2276 if (ce != m_pattern_.m_CE_[count - 1]) {
\r
// failure path: retreat start past the previous base character
2278 start = getPreviousBaseOffset(targetText, start);
\r
2279 m_utilBuffer_[0] = start;
\r
2280 m_utilBuffer_[1] = end;
\r
2286 m_utilBuffer_[0] = start;
\r
2287 m_utilBuffer_[1] = end;
\r
2292 * Checks and sets the match information if found.
\r
2295 * <li> the current match does not repeat the last match
\r
2296 * <li> boundaries are correct
\r
2297 * <li> exact matches has no extra accents
\r
2298 * <li> identical matches
\r
2300 * Otherwise the offset will be shifted to the preceding character.
\r
2301 * Uses the temporary utility buffer to store the modified textoffset.
\r
2302 * @param textoffset offset in the collation element text. the returned value
\r
2303 * will be the truncated start offset of the match or the new start
\r
2305 * @return true if the match is valid, false otherwise
\r
// NOTE(review): lossy extraction -- embedded line numbers, missing return/brace
// lines (e.g. 2312-2313, 2327-2329, 2336); restore from canonical ICU4J source.
// Validates a backwards exact-match candidate starting at textoffset: runs the
// contraction test, then rejects repeats, non-break-unit spans, matches with
// stray leading/trailing accents and non-identical matches; on success records
// m_matchedIndex_ and matchLength (end trimmed to a break boundary at PRIMARY
// strength).
2307 private final boolean checkPreviousExactMatch(int textoffset)
\r
2309 // to ensure that the start and ends are not composite characters
\r
2310 int end = m_colEIter_.getOffset();
\r
2311 if (!checkPreviousExactContractionMatch(textoffset, end)) {
\r
2314 textoffset = m_utilBuffer_[0];
\r
2315 end = m_utilBuffer_[1];
\r
2317 // this totally matches, however we need to check if it is repeating
\r
2319 if (checkRepeatedMatch(textoffset, end)
\r
2320 || !isBreakUnit(textoffset, end)
\r
2321 || hasAccentsBeforeMatch(textoffset, end)
\r
2322 || !checkIdentical(textoffset, end)
\r
2323 || hasAccentsAfterMatch(textoffset, end)) {
\r
// rejection: report the next start offset to try via m_utilBuffer_
2325 textoffset = getPreviousBaseOffset(targetText, textoffset);
\r
2326 m_utilBuffer_[0] = textoffset;
\r
2330 if (m_collator_.getStrength() == Collator.PRIMARY) {
\r
2331 end = checkBreakBoundary(end);
\r
2334 m_matchedIndex_ = textoffset;
\r
2335 matchLength = end - textoffset;
\r
2340 * Rearranges the end accents to try matching.
\r
2341 * Suffix accents in the text will be grouped according to their combining
\r
2342 * class and the groups will be mixed and matched to try to find the perfect
\r
2343 * match with the pattern.
\r
2344 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
2345 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
2347 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
2349 * step 2: check if any of the generated substrings matches the pattern.
\r
2350 * @param start offset of the first base character
\r
2351 * @param end start of the last accent set
\r
2352 * @return DONE if a match is not found, otherwise return the ending
\r
2353 * offset of the match. Note this start includes all following
\r
// NOTE(review): lossy extraction -- embedded line numbers; braces, the merge()
// middle arguments (line 2403) and return statements are among the missing
// lines; restore from canonical ICU4J source before compiling.
// Backwards canonical suffix matching: decomposes the accents that precede
// end, generates every unblocked-accent subset into
// m_canonicalSuffixAccents_, merges it with the text (and any prefix accents)
// and runs the utility collation iterator over the result to test for a match.
2356 private int doPreviousCanonicalSuffixMatch(int start, int end)
\r
2358 targetText.setIndex(end);
\r
// step back over a surrogate pair so FCD is taken on the full code point
2359 if (UTF16.isTrailSurrogate(targetText.previous())
\r
2360 && targetText.getIndex() > m_textBeginOffset_) {
\r
2361 if (!UTF16.isLeadSurrogate(targetText.previous())) {
\r
2362 targetText.next();
\r
2365 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
\r
2366 // die... failed at a base character
\r
2369 end = getNextBaseOffset(targetText, end);
\r
2371 StringBuffer accents = new StringBuffer();
\r
2372 int offset = getPreviousBaseOffset(targetText, end);
\r
2373 // normalizing the offensive string
\r
2374 String accentstr = getString(targetText, offset, end - offset);
\r
2375 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
2376 == Normalizer.NO) {
\r
2377 accentstr = Normalizer.decompose(accentstr, false);
\r
2379 accents.append(accentstr);
\r
2381 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2382 int accentsize = getUnblockedAccentIndex(accents, accentsindex);
\r
// 2^n - 1: every non-empty subset of the unblocked accent groups
2383 int count = (2 << (accentsize - 1)) - 1;
\r
2384 while (count > 0) {
\r
2385 m_canonicalSuffixAccents_.delete(0,
\r
2386 m_canonicalSuffixAccents_.length());
\r
2387 // copy the base characters
\r
2388 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2389 m_canonicalSuffixAccents_.append(accents.charAt(k));
\r
2391 // forming all possible canonical rearrangement by dropping
\r
2392 // sets of accents
\r
2393 for (int i = 0; i <= accentsize - 1; i ++) {
\r
2394 int mask = 1 << (accentsize - i - 1);
\r
2395 if ((count & mask) != 0) {
\r
2396 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2398 m_canonicalSuffixAccents_.append(accents.charAt(j));
\r
2402 StringBuffer match = merge(m_canonicalPrefixAccents_, targetText,
\r
2404 m_canonicalSuffixAccents_);
\r
2405 // run the collator iterator through this match
\r
2406 // if status is a failure ucol_setText does nothing
\r
2407 m_utilColEIter_.setText(match.toString());
\r
2408 if (checkCollationMatch(m_utilColEIter_)) {
\r
2417 * Takes the rearranged start accents and tries matching. If the match fails at
\r
2418 * a separate following set of accents (separated from the rearranged ones by
\r
2419 * at least a base character) then we rearrange the preceding accents and
\r
2420 * try matching again.
\r
2421 * We allow skipping of the ends of the accent set if the ces do not match.
\r
2422 * However if the failure is found before the accent set, it fails.
\r
2423 * Internal method, status assumed to be success, caller has to check
\r
2424 * status before calling this method.
\r
2425 * @param textoffset of the ends of the rearranged accent
\r
2426 * @return DONE if a match is not found, otherwise return the ending offset
\r
2427 * of the match. Note this start includes all following accents.
\r
// NOTE(review): lossy extraction -- embedded line numbers; the ceindex
// declaration (line ~2452), several returns, else branches and braces are
// among the missing lines; restore from canonical ICU4J source.
// Matches forward through the rearranged prefix accents (possibly merged with
// a contraction-safe stretch of the text) using the utility iterator, falling
// back to the main iterator once past the safe buffer; on a mismatch beyond
// the prefix it retries via doPreviousCanonicalSuffixMatch. Returns the end
// offset of the match, DONE on failure (per the javadoc above).
2429 private int doPreviousCanonicalPrefixMatch(int textoffset)
\r
2431 // int safelength = 0;
\r
2432 StringBuffer safetext;
\r
2433 int safeoffset = textoffset;
\r
// if the last prefix accent is contraction-unsafe, extend into the text up
// to the next safe offset and collate over the merged buffer
2435 if (textoffset > m_textBeginOffset_
\r
2436 && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt(
\r
2437 m_canonicalPrefixAccents_.length() - 1))) {
\r
2438 safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_);
\r
2439 //safelength = safeoffset - textoffset;
\r
2440 safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset,
\r
2441 safeoffset, null);
\r
2444 safetext = m_canonicalPrefixAccents_;
\r
2447 // if status is a failure, ucol_setText does nothing
\r
2448 CollationElementIterator coleiter = m_utilColEIter_;
\r
2449 coleiter.setText(safetext.toString());
\r
2450 // status checked in loop below
\r
2453 boolean isSafe = true; // safe zone indication flag for position
\r
2454 int prefixlength = m_canonicalPrefixAccents_.length();
\r
2456 while (ceindex < m_pattern_.m_CELength_) {
\r
2457 int textce = coleiter.next();
\r
2458 if (textce == CollationElementIterator.NULLORDER) {
\r
2459 // check if we have passed the safe buffer
\r
2460 if (coleiter == m_colEIter_) {
\r
2463 if (safetext != m_canonicalPrefixAccents_) {
\r
2464 safetext.delete(0, safetext.length());
\r
// ran off the safe buffer: continue in the real text with the main iterator
2466 coleiter = m_colEIter_;
\r
2467 coleiter.setExactOffset(safeoffset);
\r
2468 // status checked at the start of the loop
\r
2472 textce = getCE(textce);
\r
2473 if (textce != CollationElementIterator.IGNORABLE
\r
2474 && textce != m_pattern_.m_CE_[ceindex]) {
\r
2475 // do the beginning stuff
\r
2476 int failedoffset = coleiter.getOffset();
\r
2477 if (isSafe && failedoffset <= prefixlength) {
\r
2478 // alas... no hope. failed at rearranged accent set
\r
// translate the safe-buffer offset back into a real text offset
2483 failedoffset = safeoffset - failedoffset;
\r
2484 if (safetext != m_canonicalPrefixAccents_) {
\r
2485 safetext.delete(0, safetext.length());
\r
2489 // try rearranging the end accents
\r
2490 int result = doPreviousCanonicalSuffixMatch(textoffset,
\r
2492 if (result != DONE) {
\r
2493 // if status is a failure, ucol_setOffset does nothing
\r
2494 m_colEIter_.setExactOffset(result);
\r
2499 if (textce == m_pattern_.m_CE_[ceindex]) {
\r
2503 // set offset here
\r
2505 int result = coleiter.getOffset();
\r
2506 // sets the text iterator here with the correct expansion and offset
\r
2507 int leftoverces = coleiter.m_CEBufferSize_
\r
2508 - coleiter.m_CEBufferOffset_;
\r
2509 if (result <= prefixlength) {
\r
2510 result = textoffset;
\r
2513 result = textoffset + (safeoffset - result);
\r
2515 m_colEIter_.setExactOffset(result);
\r
2516 m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_
\r
2521 return coleiter.getOffset();
\r
2525 * Trying out the substring and sees if it can be a canonical match.
\r
2526 * This will try normalizing the starting accents and arranging them into
\r
2527 * canonical equivalents and check their corresponding ces with the pattern
\r
2529 * Prefix accents in the text will be grouped according to their combining
\r
2530 * class and the groups will be mixed and matched to try to find the perfect
\r
2531 * match with the pattern.
\r
2532 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
2533 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
2535 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
2537 * step 2: check if any of the generated substrings matches the pattern.
\r
2538 * @param textoffset start offset in the collation element text that starts
\r
2539 * with the accents to be rearranged
\r
2540 * @return true if the match is valid, false otherwise
\r
// NOTE(review): lossy extraction -- embedded line numbers, missing returns and
// braces (e.g. 2543, 2550-2555, 2557-2559, 2599-2605); restore from canonical
// ICU4J source before compiling.
// Backwards counterpart of doNextCanonicalMatch: if the character at
// textoffset carries no leading FCD accents, tries a suffix-accent
// rearrangement; otherwise decomposes the leading accents and tries every
// unblocked-accent subset as a rearranged prefix via
// doPreviousCanonicalPrefixMatch.
2542 private boolean doPreviousCanonicalMatch(int textoffset)
\r
2544 int offset = m_colEIter_.getOffset();
\r
// leading-combining-class byte == 0 means no prefix accents to rearrange
2545 if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
\r
2546 if (m_pattern_.m_hasSuffixAccents_) {
\r
2547 offset = doPreviousCanonicalSuffixMatch(textoffset, offset);
\r
2548 if (offset != DONE) {
\r
2549 m_colEIter_.setExactOffset(offset);
\r
2556 if (!m_pattern_.m_hasPrefixAccents_) {
\r
2560 StringBuffer accents = new StringBuffer();
\r
2561 // offset to the last base character in substring to search
\r
2562 int baseoffset = getNextBaseOffset(targetText, textoffset);
\r
2563 // normalizing the offensive string
\r
2564 String textstr = getString(targetText, textoffset,
\r
2565 baseoffset - textoffset);
\r
2566 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
\r
2567 == Normalizer.NO) {
\r
2568 textstr = Normalizer.decompose(textstr, false);
\r
2570 accents.append(textstr);
\r
2571 // status checked in loop
\r
2573 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2574 int size = getUnblockedAccentIndex(accents, accentsindex);
\r
2576 // 2 power n - 1 plus the full set of accents
\r
2577 int count = (2 << (size - 1)) - 1;
\r
// each count value selects one non-empty subset of the unblocked accent groups
2578 while (count > 0) {
\r
2579 m_canonicalPrefixAccents_.delete(0,
\r
2580 m_canonicalPrefixAccents_.length());
\r
2581 // copy the base characters
\r
2582 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2583 m_canonicalPrefixAccents_.append(accents.charAt(k));
\r
2585 // forming all possible canonical rearrangement by dropping
\r
2586 // sets of accents
\r
2587 for (int i = 0; i <= size - 1; i ++) {
\r
2588 int mask = 1 << (size - i - 1);
\r
2589 if ((count & mask) != 0) {
\r
2590 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2592 m_canonicalPrefixAccents_.append(accents.charAt(j));
\r
2596 offset = doPreviousCanonicalPrefixMatch(baseoffset);
\r
2597 if (offset != DONE) {
\r
2598 return true; // match found
\r
2606 * Checks match for contraction.
\r
2607 * If the match starts with a partial contraction we fail.
\r
2608 * Uses the temporary utility buffer to return the modified start and end.
\r
2609 * @param start offset of potential match, to be modified if necessary
\r
2610 * @param end offset of potential match, to be modified if necessary
\r
2611 * @return true if match passes the contraction test, false otherwise.
\r
// NOTE(review): lossy extraction -- embedded line numbers; echar/schar/temp
// declarations, returns and braces are among the missing lines; restore from
// canonical ICU4J source before compiling.
// Backwards canonical variant of the contraction check: when a boundary
// character is contraction-unsafe, re-walks the CEs of [start, end) from the
// end, tolerating extra trailing accent CEs for pure-accent patterns; the
// (possibly adjusted) boundaries are reported via m_utilBuffer_[0]/[1].
2613 private boolean checkPreviousCanonicalContractionMatch(int start, int end)
\r
2616 // This part checks if either ends of the match contains potential
\r
2617 // contraction. If so we'll have to iterate through them
\r
2620 if (end < m_textLimitOffset_) {
\r
2621 targetText.setIndex(end);
\r
2622 echar = targetText.current();
\r
2624 if (start + 1 < m_textLimitOffset_) {
\r
2625 targetText.setIndex(start + 1);
\r
2626 schar = targetText.current();
\r
2628 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2629 int expansion = m_colEIter_.m_CEBufferSize_
\r
2630 - m_colEIter_.m_CEBufferOffset_;
\r
2631 boolean hasExpansion = expansion > 0;
\r
2632 m_colEIter_.setExactOffset(end);
\r
2633 while (expansion > 0) {
\r
2634 // getting rid of the redundant ce
\r
2635 // since forward contraction/expansion may have extra ces
\r
2636 // if we are in the normalization buffer, hasAccentsBeforeMatch
\r
2637 // would have taken care of it.
\r
2638 // E.g. the character \u01FA will have an expansion of 3, but
\r
2639 // if we are only looking for A ring A\u030A, we'll have to
\r
2640 // skip the last ce in the expansion buffer
\r
2641 m_colEIter_.previous();
\r
2642 if (m_colEIter_.getOffset() != temp) {
\r
2644 temp = m_colEIter_.getOffset();
\r
// compare pattern CEs from the last one backwards
2649 int count = m_pattern_.m_CELength_;
\r
2650 while (count > 0) {
\r
2651 int ce = getCE(m_colEIter_.previous());
\r
2652 // status checked below, note that if status is a failure
\r
2653 // previous() returns NULLORDER
\r
2654 if (ce == CollationElementIterator.IGNORABLE) {
\r
2657 if (hasExpansion && count == 0
\r
2658 && m_colEIter_.getOffset() != temp) {
\r
2660 temp = m_colEIter_.getOffset();
\r
2662 if (count == m_pattern_.m_CELength_
\r
2663 && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) {
\r
2664 // accents may have extra starting ces, this occurs when a
\r
2665 // pure accent pattern is matched without rearrangement
\r
2666 int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1];
\r
2667 targetText.setIndex(end);
\r
// back up over a surrogate pair before taking the FCD value
2668 if (UTF16.isTrailSurrogate(targetText.previous())) {
\r
2669 if (targetText.getIndex() > m_textBeginOffset_ &&
\r
2670 !UTF16.isLeadSurrogate(targetText.previous())) {
\r
2671 targetText.next();
\r
2674 end = targetText.getIndex();
\r
2675 if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) {
\r
// skip trailing accent CEs until the expected last pattern CE appears
2676 ce = getCE(m_colEIter_.previous());
\r
2677 while (ce != expected
\r
2678 && ce != CollationElementIterator.NULLORDER
\r
2679 && m_colEIter_.getOffset() <= start) {
\r
2680 ce = getCE(m_colEIter_.previous());
\r
2684 if (ce != m_pattern_.m_CE_[count - 1]) {
\r
// failure path: retreat start past the previous base character
2686 start = getPreviousBaseOffset(start);
\r
2687 m_utilBuffer_[0] = start;
\r
2688 m_utilBuffer_[1] = end;
\r
2694 m_utilBuffer_[0] = start;
\r
2695 m_utilBuffer_[1] = end;
\r
2700 * Checks and sets the match information if found.
\r
2703 * <li> the potential match does not repeat the previous match
\r
2704 * <li> boundaries are correct
\r
2705 * <li> potential match does not end in the middle of a contraction
\r
2706 * <li> identical matches
\r
2708 * Otherwise the offset will be shifted to the next character.
\r
2709 * Uses the temporary utility buffer for storing the modified textoffset.
\r
2710 * @param textoffset offset in the collation element text. the returned
\r
2711 * value will be the truncated start offset of the match or the
\r
2712 * new start search offset.
\r
2713 * @return true if the match is valid, false otherwise
\r
// NOTE(review): lossy extraction -- embedded line numbers; the matchLength
// expression at line 2724 is truncated mid-statement and returns/braces are
// missing (e.g. 2725-2728, 2732-2733, 2744-2746); restore from canonical
// ICU4J source before compiling.
// Validates a backwards canonical match candidate starting at textoffset:
// accepts immediately when accent rearrangement already produced the match,
// otherwise runs the contraction / repeated-match / break-unit / identical
// checks and on success records m_matchedIndex_ and matchLength.
2715 private boolean checkPreviousCanonicalMatch(int textoffset)
\r
2717 // to ensure that the start and ends are not composite characters
\r
2718 // if we have a canonical accent match
\r
2719 if (m_pattern_.m_hasSuffixAccents_
\r
2720 && m_canonicalSuffixAccents_.length() != 0
\r
2721 || m_pattern_.m_hasPrefixAccents_
\r
2722 && m_canonicalPrefixAccents_.length() != 0) {
\r
2723 m_matchedIndex_ = textoffset;
\r
2724 matchLength = getNextBaseOffset(m_colEIter_.getOffset())
\r
2729 int end = m_colEIter_.getOffset();
\r
2730 if (!checkPreviousCanonicalContractionMatch(textoffset, end)) {
\r
2731 // storing the modified textoffset
\r
2734 textoffset = m_utilBuffer_[0];
\r
2735 end = m_utilBuffer_[1];
\r
2736 end = getNextBaseOffset(end);
\r
2737 // this totally matches, however we need to check if it is repeating
\r
2738 if (checkRepeatedMatch(textoffset, end)
\r
2739 || !isBreakUnit(textoffset, end)
\r
2740 || !checkIdentical(textoffset, end)) {
\r
// rejection: report the next start offset to try via m_utilBuffer_
2742 textoffset = getPreviousBaseOffset(textoffset);
\r
2743 m_utilBuffer_[0] = textoffset;
\r
2747 m_matchedIndex_ = textoffset;
\r
2748 matchLength = end - textoffset;
\r
2753 * Method that does the next exact match
\r
2754 * @param start the offset to start shifting from and performing the
\r
2755 * next exact match
\r
// NOTE(review): lossy extraction -- embedded line numbers; loop headers,
// found-assignments, braces and the final-match return path are among the
// missing lines (e.g. 2768-2769, 2775-2777, 2791-2799, 2822-2824, 2827-2830);
// restore from canonical ICU4J source before compiling.
// Boyer-Moore-style forward exact search driver: repeatedly positions the
// collation iterator at the candidate end offset, compares pattern CEs
// backwards from the last, shifts forward on mismatch, and on a full CE match
// defers to checkNextExactMatch; calls setMatchNotFound when the text limit
// is passed.
2757 private void handleNextExact(int start)
\r
2759 int textoffset = shiftForward(start,
\r
2760 CollationElementIterator.NULLORDER,
\r
2761 m_pattern_.m_CELength_);
\r
2762 int targetce = CollationElementIterator.IGNORABLE;
\r
2763 while (textoffset <= m_textLimitOffset_) {
\r
2764 m_colEIter_.setExactOffset(textoffset);
\r
2765 int patternceindex = m_pattern_.m_CELength_ - 1;
\r
2766 boolean found = false;
\r
2767 int lastce = CollationElementIterator.NULLORDER;
\r
2770 // finding the last pattern ce match, imagine composite
\r
2771 // characters. for example: search for pattern A in text \u00C0
\r
2772 // we'll have to skip \u0300 the grave first before we get to A
\r
2773 targetce = m_colEIter_.previous();
\r
2774 if (targetce == CollationElementIterator.NULLORDER) {
\r
2778 targetce = getCE(targetce);
\r
2779 if (targetce == CollationElementIterator.IGNORABLE &&
\r
2780 m_colEIter_.isInBuffer()) {
\r
2781 // this is for the text \u0315\u0300 that requires
\r
2782 // normalization and pattern \u0300, where \u0315 is ignorable
\r
// remember the first non-ignorable CE seen; it drives the shift on failure
2785 if (lastce == CollationElementIterator.NULLORDER
\r
2786 || lastce == CollationElementIterator.IGNORABLE) {
\r
2787 lastce = targetce;
\r
2789 if (targetce == m_pattern_.m_CE_[patternceindex]) {
\r
2790 // the first ce can be a contraction
\r
2794 if (m_colEIter_.m_CEBufferOffset_ <= 0) {
\r
// walk the remaining pattern CEs backwards while they keep matching
2800 while (found && patternceindex > 0) {
\r
2801 lastce = targetce;
\r
2802 targetce = m_colEIter_.previous();
\r
2803 if (targetce == CollationElementIterator.NULLORDER) {
\r
2807 targetce = getCE(targetce);
\r
2808 if (targetce == CollationElementIterator.IGNORABLE) {
\r
2812 patternceindex --;
\r
2813 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
2816 targetce = lastce;
\r
2819 textoffset = shiftForward(textoffset, lastce, patternceindex);
\r
2820 // status checked at loop.
\r
2821 patternceindex = m_pattern_.m_CELength_;
\r
2825 if (checkNextExactMatch(textoffset)) {
\r
2826 // status checked in ucol_setOffset
\r
2829 textoffset = m_utilBuffer_[0];
\r
2831 setMatchNotFound();
\r
2835 * Method that does the next canonical match
\r
2836 * @param start the offset to start shifting from and performing the
\r
2837 * next canonical match
\r
// NOTE(review): lossy extraction -- embedded line numbers; loop headers,
// found-assignments, braces and the match-return path are among the missing
// lines (e.g. 2840, 2843, 2852-2854, 2876-2884, 2909-2911, 2913-2916);
// restore from canonical ICU4J source before compiling.
// Forward canonical search driver: same backwards CE comparison as
// handleNextExact, but clears the canonical prefix/suffix accent buffers per
// search and, when the plain CE comparison fails on an accented pattern,
// attempts accent rearrangement through doNextCanonicalMatch before shifting;
// candidate matches are vetted by checkNextCanonicalMatch.
2839 private void handleNextCanonical(int start)
\r
2841 boolean hasPatternAccents =
\r
2842 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
\r
2844 // shifting it check for setting offset
\r
2845 // if setOffset is called previously or there was no previous match, we
\r
2846 // leave the offset as it is.
\r
2847 int textoffset = shiftForward(start, CollationElementIterator.NULLORDER,
\r
2848 m_pattern_.m_CELength_);
\r
2849 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
\r
2850 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
\r
2851 int targetce = CollationElementIterator.IGNORABLE;
\r
2853 while (textoffset <= m_textLimitOffset_)
\r
2855 m_colEIter_.setExactOffset(textoffset);
\r
2856 int patternceindex = m_pattern_.m_CELength_ - 1;
\r
2857 boolean found = false;
\r
2858 int lastce = CollationElementIterator.NULLORDER;
\r
2861 // finding the last pattern ce match, imagine composite characters
\r
2862 // for example: search for pattern A in text \u00C0
\r
2863 // we'll have to skip \u0300 the grave first before we get to A
\r
2864 targetce = m_colEIter_.previous();
\r
2865 if (targetce == CollationElementIterator.NULLORDER) {
\r
2869 targetce = getCE(targetce);
\r
// remember the first non-ignorable CE seen; it drives the shift on failure
2870 if (lastce == CollationElementIterator.NULLORDER
\r
2871 || lastce == CollationElementIterator.IGNORABLE) {
\r
2872 lastce = targetce;
\r
2874 if (targetce == m_pattern_.m_CE_[patternceindex]) {
\r
2875 // the first ce can be a contraction
\r
2879 if (m_colEIter_.m_CEBufferOffset_ <= 0) {
\r
// walk the remaining pattern CEs backwards while they keep matching
2885 while (found && patternceindex > 0) {
\r
2886 targetce = m_colEIter_.previous();
\r
2887 if (targetce == CollationElementIterator.NULLORDER) {
\r
2891 targetce = getCE(targetce);
\r
2892 if (targetce == CollationElementIterator.IGNORABLE) {
\r
2896 patternceindex --;
\r
2897 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
2900 // initializing the rearranged accent array
\r
2901 if (hasPatternAccents && !found) {
\r
2902 found = doNextCanonicalMatch(textoffset);
\r
2906 textoffset = shiftForward(textoffset, lastce, patternceindex);
\r
2907 // status checked at loop
\r
2908 patternceindex = m_pattern_.m_CELength_;
\r
2912 if (checkNextCanonicalMatch(textoffset)) {
\r
2915 textoffset = m_utilBuffer_[0];
\r
2917 setMatchNotFound();
\r
2921 * Method that does the previous exact match
\r
2922 * @param start the offset to start shifting from and performing the
\r
2923 * previous exact match
\r
// Performs the backward ("previous") search in exact (non-canonical) mode:
// starting from 'start', repeatedly shifts the candidate text offset
// backward and tries to align the pattern's collation elements against the
// text, walking the text CEs forward from the candidate start.
// NOTE(review): the original line numbering in this extraction has gaps —
// several statements and closing braces of this method are not visible here;
// comments below describe only what the visible lines establish.
2925 private void handlePreviousExact(int start)

2927 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,

2928 m_pattern_.m_CELength_);

// Main shift loop: each iteration tests one candidate start offset.
2929 while (textoffset >= m_textBeginOffset_)

2931 m_colEIter_.setExactOffset(textoffset);

// Pattern CE 0 is matched first (below); continue from index 1.
2932 int patternceindex = 1;

2933 int targetce = CollationElementIterator.IGNORABLE;

2934 boolean found = false;

2935 int firstce = CollationElementIterator.NULLORDER;

2938 // finding the first pattern ce match, imagine composite

2939 // characters. for example: search for pattern \u0300 in text

2940 // \u00C0, we'll have to skip A first before we get to

2941 // \u0300 the grave accent

2942 targetce = m_colEIter_.next();

// NULLORDER from next() means we ran off the end of the text.
2943 if (targetce == CollationElementIterator.NULLORDER) {

// Strip the CE down to the strength-relevant bits.
2947 targetce = getCE(targetce);

// Remember the first non-ignorable CE seen; it drives the next shift.
2948 if (firstce == CollationElementIterator.NULLORDER

2949 || firstce == CollationElementIterator.IGNORABLE) {

2950 firstce = targetce;

// At strengths above PRIMARY an ignorable text CE is skipped rather
// than compared against the pattern.
2952 if (targetce == CollationElementIterator.IGNORABLE && m_collator_.getStrength() != Collator.PRIMARY) {

2955 if (targetce == m_pattern_.m_CE_[0]) {

// Accept the CE as a match start only when it is not taken from the
// middle of an expansion buffer (-1 = no buffer, or buffer exhausted).
2959 if (m_colEIter_.m_CEBufferOffset_ == -1

2960 || m_colEIter_.m_CEBufferOffset_

2961 == m_colEIter_.m_CEBufferSize_) {

2962 // checking for accents in composite character

2968 //targetce = firstce;

// Continue matching the remaining pattern CEs forward.
2970 while (found && patternceindex < m_pattern_.m_CELength_) {

2971 firstce = targetce;

2972 targetce = m_colEIter_.next();

2973 if (targetce == CollationElementIterator.NULLORDER) {

2977 targetce = getCE(targetce);

// Ignorable CEs in the text do not consume a pattern CE.
2978 if (targetce == CollationElementIterator.IGNORABLE) {

2982 found = found && targetce == m_pattern_.m_CE_[patternceindex];

2983 patternceindex ++;

2986 targetce = firstce;

// No match at this offset: shift backward using the first CE seen and
// how far into the pattern matching progressed.
2989 textoffset = reverseShift(textoffset, targetce, patternceindex);

2990 patternceindex = 0;

// Candidate matched on CEs; verify boundaries/exceptions. On acceptance
// the match state is set inside checkPreviousExactMatch.
2994 if (checkPreviousExactMatch(textoffset)) {

// Rejected: resume from the offset left in m_utilBuffer_[0].
2997 textoffset = m_utilBuffer_[0];

// Loop exhausted the text without a match.
2999 setMatchNotFound();
\r
3003 * Method that does the previous canonical match
\r
3004 * @param start the offset to start shifting from and performing the
\r
3005 * previous canonical match
\r
// Performs the backward ("previous") search in canonical-equivalence mode:
// like handlePreviousExact, but a CE mismatch may be retried via
// doPreviousCanonicalMatch when the pattern begins or ends with accents,
// allowing canonically-equivalent accent reorderings to match.
// NOTE(review): the original line numbering in this extraction has gaps —
// several statements and closing braces of this method are not visible here;
// comments below describe only what the visible lines establish.
3007 private void handlePreviousCanonical(int start)

// True if the pattern begins or ends with combining accents; only then is
// the extra canonical-rearrangement matching attempted.
3009 boolean hasPatternAccents =

3010 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;

3012 // shifting it check for setting offset

3013 // if setOffset is called previously or there was no previous match, we

3014 // leave the offset as it is.

3015 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,

3016 m_pattern_.m_CELength_);

// Reset the scratch buffers that collect prefix/suffix accents of a
// canonical match candidate.
3017 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());

3018 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());

// Main shift loop: each iteration tests one candidate start offset.
3020 while (textoffset >= m_textBeginOffset_)

3022 m_colEIter_.setExactOffset(textoffset);

// Pattern CE 0 is matched first (below); continue from index 1.
3023 int patternceindex = 1;

3024 int targetce = CollationElementIterator.IGNORABLE;

3025 boolean found = false;

3026 int firstce = CollationElementIterator.NULLORDER;

3029 // finding the first pattern ce match, imagine composite

3030 // characters. for example: search for pattern \u0300 in text

3031 // \u00C0, we'll have to skip A first before we get to

3032 // \u0300 the grave accent

3033 targetce = m_colEIter_.next();

// NULLORDER from next() means we ran off the end of the text.
3034 if (targetce == CollationElementIterator.NULLORDER) {

3038 targetce = getCE(targetce);

// Remember the first non-ignorable CE seen; it drives the next shift.
3039 if (firstce == CollationElementIterator.NULLORDER

3040 || firstce == CollationElementIterator.IGNORABLE) {

3041 firstce = targetce;

3044 if (targetce == m_pattern_.m_CE_[0]) {

3045 // the first ce can be a contraction

// Accept the CE as a match start only when it is not taken from the
// middle of an expansion buffer (-1 = no buffer, or buffer exhausted).
3049 if (m_colEIter_.m_CEBufferOffset_ == -1

3050 || m_colEIter_.m_CEBufferOffset_

3051 == m_colEIter_.m_CEBufferSize_) {

3052 // checking for accents in composite character

3058 targetce = firstce;

// Continue matching the remaining pattern CEs forward.
3060 while (found && patternceindex < m_pattern_.m_CELength_) {

3061 targetce = m_colEIter_.next();

3062 if (targetce == CollationElementIterator.NULLORDER) {

3066 targetce = getCE(targetce);

// Ignorable CEs in the text do not consume a pattern CE.
3067 if (targetce == CollationElementIterator.IGNORABLE) {

3071 found = found && targetce == m_pattern_.m_CE_[patternceindex];

3072 patternceindex ++;

3075 // initializing the rearranged accent array

// Straight CE comparison failed: if the pattern carries edge accents,
// retry allowing canonically-equivalent accent reorderings.
3076 if (hasPatternAccents && !found) {

3077 found = doPreviousCanonicalMatch(textoffset);

// No match at this offset: shift backward and reset the pattern index.
3081 textoffset = reverseShift(textoffset, targetce, patternceindex);

3082 patternceindex = 0;

// Candidate matched on CEs; verify boundaries/exceptions. On acceptance
// the match state is set inside checkPreviousCanonicalMatch.
3086 if (checkPreviousCanonicalMatch(textoffset)) {

// Rejected: resume from the offset left in m_utilBuffer_[0].
3089 textoffset = m_utilBuffer_[0];

// Loop exhausted the text without a match.
3091 setMatchNotFound();
\r
3095 * Gets a substring out of a CharacterIterator
\r
3096 * @param text CharacterIterator
\r
3097 * @param start start offset
\r
3098 * @param length of substring
\r
3099 * @return substring from text starting at start and length length
\r
// Extracts a substring of the given length from a CharacterIterator,
// starting at 'start', without disturbing the iterator's position: the
// original index is saved up front and restored before returning.
// NOTE(review): this extraction elides some lines (the 'length' parameter
// continuation, braces, and — presumably — a text.next() that advances the
// iterator inside the loop; confirm against the full source).
3101 private static final String getString(CharacterIterator text, int start,

// Pre-size the buffer to the known result length.
3104 StringBuffer result = new StringBuffer(length);

// Save the caller's iterator position so it can be restored below.
3105 int offset = text.getIndex();

3106 text.setIndex(start);

3107 for (int i = 0; i < length; i ++) {

3108 result.append(text.current());

// Restore the iterator to where the caller left it.
3111 text.setIndex(offset);

3112 return result.toString();
\r
3116 * Getting the mask for collation strength
\r
3117 * @param strength collation strength
\r
3118 * @return collation element mask
\r
// Maps a collation strength to the bit mask used to strip a collation
// element down to the orders significant at that strength: PRIMARY keeps
// only primary bits; SECONDARY keeps primary+secondary; anything else
// (the elided default case) keeps primary+secondary+tertiary.
// NOTE(review): this extraction elides the braces and the 'default:' label
// of the switch; only the case bodies are visible here.
3120 private static final int getMask(int strength)

3122 switch (strength)

3124 case Collator.PRIMARY:

3125 return RuleBasedCollator.CE_PRIMARY_MASK_;

3126 case Collator.SECONDARY:

3127 return RuleBasedCollator.CE_SECONDARY_MASK_

3128 | RuleBasedCollator.CE_PRIMARY_MASK_;

3130 return RuleBasedCollator.CE_TERTIARY_MASK_

3131 | RuleBasedCollator.CE_SECONDARY_MASK_

3132 | RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3137 * Sets match not found
\r
// Resets the search state to "no match": the matched index is set to DONE
// and the matched length to 0. Called by the handle*() search methods when
// the text is exhausted without finding the pattern.
3139 private void setMatchNotFound()

3141 // this method resets the match result regardless of the error status.

3142 m_matchedIndex_ = DONE;

3143 setMatchLength(0);
\r
3147 * Check the boundaries of the match.
\r
3149 private int checkBreakBoundary(int end) {
\r
3150 if (!m_charBreakIter_.isBoundary(end)) {
\r
3151 end = m_charBreakIter_.following(end);
\r