2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.text.CharacterIterator;
\r
13 * <p>SearchIterator is an abstract base class that defines a protocol
\r
14 * for text searching. Subclasses provide concrete implementations of
\r
15 * various search algorithms. A concrete subclass, StringSearch, is
\r
16 * provided that implements language-sensitive pattern matching based
\r
17 * on the comparison rules defined in a RuleBasedCollator
\r
18 * object. Instances of SearchIterator maintain a current position and
\r
19 * scan over the target text, returning the indices where a match is
\r
20 * found and the length of each match. Generally, the sequence of forward
\r
21 * matches will be equivalent to the sequence of backward matches.One
\r
22 * case where this statement may not hold is when non-overlapping mode
\r
23 * is set on and there are continuous repetitive patterns in the text.
\r
24 * Consider the case searching for pattern "aba" in the text
\r
25 * "ababababa", setting overlapping mode off will produce forward matches
\r
26 * at offsets 0, 4. However when a backwards search is done, the
\r
27 * results will be at offsets 6 and 2.</p>
\r
29 * <p>If matches searched for have boundary restrictions. BreakIterators
\r
30 * can be used to define the valid boundaries of such a match. Once a
\r
31 * BreakIterator is set, potential matches will be tested against the
\r
32 * BreakIterator to determine if the boundaries are valid and that all
\r
33 * characters in the potential match are equivalent to the pattern
\r
34 * searched for. For example, looking for the pattern "fox" in the text
\r
35 * "foxy fox" will produce match results at offset 0 and 5 with length 3
\r
36 * if no BreakIterators were set. However if a WordBreakIterator is set,
\r
37 * the only match that would be found will be at the offset 5. Since,
\r
38 * the SearchIterator guarantees that if a BreakIterator is set, all its
\r
39 * matches will match the given pattern exactly, a potential match that
\r
40 * passes the BreakIterator might still not produce a valid match. For
\r
41 * instance the pattern "e" will not be found in the string
\r
42 * "\u00e9" (latin small letter e with acute) if a
\r
43 * CharacterBreakIterator is used. Even though "e" is
\r
44 * a part of the character "\u00e9" and the potential match at
\r
45 * offset 0 length 1 passes the CharacterBreakIterator test, "\u00e9"
\r
46 * is not equivalent to "e", hence the SearchIterator rejects the potential
\r
47 * match. By default, the SearchIterator
\r
48 * does not impose any boundary restriction on the matches, it will
\r
49 * return all results that match the pattern. Illustrating with the
\r
50 * above example, "e" will
\r
51 * be found in the string "\u00e9" if no BreakIterator is
\r
54 * <p>SearchIterator also provides a means to handle overlapping
\r
55 * matches via the API setOverlapping(boolean). For example, if
\r
56 * overlapping mode is set, searching for the pattern "abab" in the
\r
57 * text "ababab" will match at positions 0 and 2, whereas if
\r
58 * overlapping is not set, SearchIterator will only match at position
\r
59 * 0. By default, overlapping mode is not set.</p>
\r
61 * <p>The APIs in SearchIterator are similar to that of other text
\r
62 * iteration classes such as BreakIterator. Using this class, it is
\r
63 * easy to scan through text looking for all occurances of a
\r
66 * Example of use:<br>
\r
68 * String target = "The quick brown fox jumped over the lazy fox";
\r
69 * String pattern = "fox";
\r
70 * SearchIterator iter = new StringSearch(pattern, target);
\r
71 * for (int pos = iter.first(); pos != SearchIterator.DONE;
\r
72 * pos = iter.next()) {
\r
73 * // println matches at offset 16 and 41 with length 3
\r
74 * System.out.println("Found match at " + pos + ", length is "
\r
75 * + iter.getMatchLength());
\r
77 * target = "ababababa";
\r
79 * iter.setTarget(new StringCharacterIterator(pattern));
\r
80 * iter.setOverlapping(false);
\r
81 * System.out.println("Overlapping mode set to false");
\r
82 * System.out.println("Forward matches of pattern " + pattern + " in text "
\r
84 * for (int pos = iter.first(); pos != SearchIterator.DONE;
\r
85 * pos = iter.next()) {
\r
86 * // println matches at offset 0 and 4 with length 3
\r
87 * System.out.println("offset " + pos + ", length "
\r
88 * + iter.getMatchLength());
\r
90 * System.out.println("Backward matches of pattern " + pattern + " in text "
\r
92 * for (int pos = iter.last(); pos != SearchIterator.DONE;
\r
93 * pos = iter.previous()) {
\r
94 * // println matches at offset 6 and 2 with length 3
\r
95 * System.out.println("offset " + pos + ", length "
\r
96 * + iter.getMatchLength());
\r
98 * System.out.println("Overlapping mode set to true");
\r
99 * System.out.println("Index set to 2");
\r
100 * iter.setIndex(2);
\r
101 * iter.setOverlapping(true);
\r
102 * System.out.println("Forward matches of pattern " + pattern + " in text "
\r
104 * for (int pos = iter.first(); pos != SearchIterator.DONE;
\r
105 * pos = iter.next()) {
\r
106 * // println matches at offset 2, 4 and 6 with length 3
\r
107 * System.out.println("offset " + pos + ", length "
\r
108 * + iter.getMatchLength());
\r
110 * System.out.println("Index set to 2");
\r
111 * iter.setIndex(2);
\r
112 * System.out.println("Backward matches of pattern " + pattern + " in text "
\r
114 * for (int pos = iter.last(); pos != SearchIterator.DONE;
\r
115 * pos = iter.previous()) {
\r
116 * // println matches at offset 0 with length 3
\r
117 * System.out.println("offset " + pos + ", length "
\r
118 * + iter.getMatchLength());
\r
122 * @author Laura Werner, synwee
\r
124 * @see BreakIterator
\r
126 public abstract class SearchIterator
\r
129 // public data members -------------------------------------------------
\r
132 * DONE is returned by previous() and next() after all valid matches have
\r
133 * been returned, and by first() and last() if there are no matches at all.
\r
138 public static final int DONE = -1;
\r
140 // public methods -----------------------------------------------------
\r
142 // public setters -----------------------------------------------------
\r
146 * Sets the position in the target text at which the next search will start.
\r
147 * This method clears any previous match.
\r
149 * @param position position from which to start the next search
\r
150 * @exception IndexOutOfBoundsException thrown if argument position is out
\r
151 * of the target text range.
\r
155 public void setIndex(int position) {
\r
156 if (position < targetText.getBeginIndex()
\r
157 || position > targetText.getEndIndex()) {
\r
158 throw new IndexOutOfBoundsException(
\r
159 "setIndex(int) expected position to be between " +
\r
160 targetText.getBeginIndex() + " and " + targetText.getEndIndex());
\r
162 m_setOffset_ = position;
\r
169 * Determines whether overlapping matches are returned. See the class
\r
170 * documentation for more information about overlapping matches.
\r
173 * The default setting of this property is false
\r
175 * @param allowOverlap flag indicator if overlapping matches are allowed
\r
176 * @see #isOverlapping
\r
179 public void setOverlapping(boolean allowOverlap)
\r
181 m_isOverlap_ = allowOverlap;
\r
185 * Set the BreakIterator that is used to restrict the points at which
\r
186 * matches are detected.
\r
187 * Using <tt>null</tt> as the parameter is legal; it means that break
\r
188 * detection should not be attempted.
\r
189 * See class documentation for more information.
\r
190 * @param breakiter A BreakIterator that will be used to restrict the
\r
191 * points at which matches are detected.
\r
192 * @see #getBreakIterator
\r
193 * @see BreakIterator
\r
196 public void setBreakIterator(BreakIterator breakiter)
\r
198 breakIterator = breakiter;
\r
199 if (breakIterator != null) {
\r
200 breakIterator.setText(targetText);
\r
205 * Set the target text to be searched. Text iteration will then begin at
\r
206 * the start of the text string. This method is useful if you want to
\r
207 * reuse an iterator to search within a different body of text.
\r
208 * @param text new text iterator to look for match,
\r
209 * @exception IllegalArgumentException thrown when text is null or has
\r
214 public void setTarget(CharacterIterator text)
\r
216 if (text == null || text.getEndIndex() == text.getIndex()) {
\r
217 throw new IllegalArgumentException("Illegal null or empty text");
\r
221 targetText.setIndex(targetText.getBeginIndex());
\r
224 m_isForwardSearching_ = true;
\r
225 if (breakIterator != null) {
\r
226 breakIterator.setText(targetText);
\r
230 // public getters ----------------------------------------------------
\r
234 * Returns the index of the most recent match in the target text.
\r
235 * This call returns a valid result only after a successful call to
\r
236 * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
\r
237 * Just after construction, or after a searching method returns
\r
238 * <tt>DONE</tt>, this method will return <tt>DONE</tt>.
\r
241 * Use <tt>getMatchLength</tt> to get the length of the matched text.
\r
242 * <tt>getMatchedText</tt> will return the subtext in the searched
\r
243 * target text from index getMatchStart() with length getMatchLength().
\r
245 * @return index to a substring within the text string that is being
\r
247 * @see #getMatchLength
\r
248 * @see #getMatchedText
\r
256 public int getMatchStart()
\r
258 return m_lastMatchStart_;
\r
262 * Return the index in the target text at which the iterator is currently
\r
264 * If the iteration has gone past the end of the target text, or past
\r
265 * the beginning for a backwards search, {@link #DONE} is returned.
\r
266 * @return index in the target text at which the iterator is currently
\r
275 public abstract int getIndex();
\r
279 * Returns the length of the most recent match in the target text.
\r
280 * This call returns a valid result only after a successful
\r
281 * call to {@link #first}, {@link #next}, {@link #previous}, or
\r
283 * Just after construction, or after a searching method returns
\r
284 * <tt>DONE</tt>, this method will return 0. See getMatchStart() for
\r
287 * @return The length of the most recent match in the target text, or 0 if
\r
288 * there is no match.
\r
289 * @see #getMatchStart
\r
290 * @see #getMatchedText
\r
298 public int getMatchLength()
\r
300 return matchLength;
\r
304 * Returns the BreakIterator that is used to restrict the indexes at which
\r
305 * matches are detected. This will be the same object that was passed to
\r
306 * the constructor or to <code>setBreakIterator</code>.
\r
307 * If the BreakIterator has not been set, <tt>null</tt> will be returned.
\r
308 * See setBreakIterator for more information.
\r
309 * @return the BreakIterator set to restrict logic matches
\r
310 * @see #setBreakIterator
\r
311 * @see BreakIterator
\r
314 public BreakIterator getBreakIterator()
\r
316 return breakIterator;
\r
320 * Return the target text that is being searched.
\r
321 * @return target text being searched.
\r
325 public CharacterIterator getTarget()
\r
331 * Returns the text that was matched by the most recent call to
\r
332 * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
\r
333 * If the iterator is not pointing at a valid match, for instance just
\r
334 * after construction or after <tt>DONE</tt> has been returned, an empty
\r
335 * String will be returned. See getMatchStart for more information
\r
336 * @see #getMatchStart
\r
337 * @see #getMatchLength
\r
343 * @return the substring in the target text of the most recent match
\r
346 public String getMatchedText()
\r
348 if (matchLength > 0) {
\r
349 int limit = m_lastMatchStart_ + matchLength;
\r
350 StringBuilder result = new StringBuilder(matchLength);
\r
351 result.append(targetText.current());
\r
353 while (targetText.getIndex() < limit) {
\r
354 result.append(targetText.current());
\r
357 targetText.setIndex(m_lastMatchStart_);
\r
358 return result.toString();
\r
363 // miscellaneous public methods -----------------------------------------
\r
366 * Search <b>forwards</b> in the target text for the next valid match,
\r
367 * starting the search from the current iterator position. The iterator is
\r
368 * adjusted so that its current index, as returned by {@link #getIndex},
\r
369 * is the starting position of the match if one was found. If a match is
\r
370 * found, the index of the match is returned, otherwise <tt>DONE</tt> is
\r
371 * returned. If overlapping mode is set, the beginning of the found match
\r
372 * can be before the end of the current match, if any.
\r
373 * @return The starting index of the next forward match after the current
\r
374 * iterator position, or
\r
375 * <tt>DONE</tt> if there are no more matches.
\r
376 * @see #getMatchStart
\r
377 * @see #getMatchLength
\r
378 * @see #getMatchedText
\r
389 int start = targetText.getIndex();
\r
390 if (m_setOffset_ != DONE) {
\r
391 start = m_setOffset_;
\r
392 m_setOffset_ = DONE;
\r
394 if (m_isForwardSearching_) {
\r
396 start + matchLength >= targetText.getEndIndex()) {
\r
397 // not enough characters to match
\r
399 targetText.setIndex(targetText.getEndIndex());
\r
400 m_lastMatchStart_ = DONE;
\r
406 // switching direction.
\r
407 // if matchedIndex == USEARCH_DONE, it means that either a
\r
408 // setIndex has been called or that previous ran off the text
\r
409 // string. the iterator would have been set to offset 0 if a
\r
410 // match is not found.
\r
411 m_isForwardSearching_ = true;
\r
412 if (start != DONE) {
\r
413 // there's no need to set the collation element iterator
\r
414 // the next call to next will set the offset.
\r
419 if (start == DONE) {
\r
420 start = targetText.getBeginIndex();
\r
422 if (matchLength > 0) {
\r
423 // if match length is 0 we are at the start of the iteration
\r
424 if (m_isOverlap_) {
\r
428 start += matchLength;
\r
431 m_lastMatchStart_ = handleNext(start);
\r
432 return m_lastMatchStart_;
\r
436 * Search <b>backwards</b> in the target text for the next valid match,
\r
437 * starting the search from the current iterator position. The iterator is
\r
438 * adjusted so that its current index, as returned by {@link #getIndex},
\r
439 * is the starting position of the match if one was found. If a match is
\r
440 * found, the index is returned, otherwise <tt>DONE</tt> is returned. If
\r
441 * overlapping mode is set, the end of the found match can be after the
\r
442 * beginning of the previous match, if any.
\r
443 * @return The starting index of the next backwards match after the current
\r
444 * iterator position, or
\r
445 * <tt>DONE</tt> if there are no more matches.
\r
446 * @see #getMatchStart
\r
447 * @see #getMatchLength
\r
448 * @see #getMatchedText
\r
457 public int previous()
\r
459 int start = targetText.getIndex();
\r
460 if (m_setOffset_ != DONE) {
\r
461 start = m_setOffset_;
\r
462 m_setOffset_ = DONE;
\r
465 m_isForwardSearching_ = false;
\r
467 start = targetText.getEndIndex();
\r
470 if (m_isForwardSearching_ == true) {
\r
471 // switching direction.
\r
472 // if matchedIndex == USEARCH_DONE, it means that either a
\r
473 // setIndex has been called or that next ran off the text
\r
474 // string. the iterator would have been set to offset textLength if
\r
475 // a match is not found.
\r
476 m_isForwardSearching_ = false;
\r
477 if (start != targetText.getEndIndex()) {
\r
482 if (start == targetText.getBeginIndex()) {
\r
483 // not enough characters to match
\r
485 targetText.setIndex(targetText.getBeginIndex());
\r
486 m_lastMatchStart_ = DONE;
\r
491 m_lastMatchStart_ = handlePrevious(start);
\r
492 return m_lastMatchStart_;
\r
496 * Return true if the overlapping property has been set.
\r
497 * See setOverlapping(boolean) for more information.
\r
498 * @see #setOverlapping
\r
499 * @return true if the overlapping property has been set, false otherwise
\r
502 public boolean isOverlapping()
\r
504 return m_isOverlap_;
\r
509 * Resets the search iteration. All properties will be reset to their
\r
513 * If a forward iteration is initiated, the next search will begin at the
\r
514 * start of the target text. Otherwise, if a backwards iteration is initiated,
\r
515 * the next search will begin at the end of the target text.
\r
519 public void reset()
\r
521 // reset is setting the attributes that are already in string search
\r
523 setIndex(targetText.getBeginIndex());
\r
524 m_isOverlap_ = false;
\r
525 m_isForwardSearching_ = true;
\r
527 m_setOffset_ = DONE;
\r
531 * Return the index of the first <b>forward</b> match in the target text.
\r
532 * This method sets the iteration to begin at the start of the
\r
533 * target text and searches forward from there.
\r
534 * @return The index of the first forward match, or <code>DONE</code>
\r
535 * if there are no matches.
\r
536 * @see #getMatchStart
\r
537 * @see #getMatchLength
\r
538 * @see #getMatchedText
\r
547 public final int first()
\r
549 m_isForwardSearching_ = true;
\r
550 setIndex(targetText.getBeginIndex());
\r
555 * Return the index of the first <b>forward</b> match in target text that
\r
556 * is at or after argument <tt>position</tt>.
\r
557 * This method sets the iteration to begin at the specified
\r
558 * position in the the target text and searches forward from there.
\r
559 * @return The index of the first forward match, or <code>DONE</code>
\r
560 * if there are no matches.
\r
561 * @see #getMatchStart
\r
562 * @see #getMatchLength
\r
563 * @see #getMatchedText
\r
572 public final int following(int position)
\r
574 m_isForwardSearching_ = true;
\r
575 // position checked in usearch_setOffset
\r
576 setIndex(position);
\r
581 * Return the index of the first <b>backward</b> match in target text.
\r
582 * This method sets the iteration to begin at the end of the
\r
583 * target text and searches backwards from there.
\r
584 * @return The starting index of the first backward match, or
\r
585 * <code>DONE</code> if there are no matches.
\r
586 * @see #getMatchStart
\r
587 * @see #getMatchLength
\r
588 * @see #getMatchedText
\r
597 public final int last()
\r
599 m_isForwardSearching_ = false;
\r
600 setIndex(targetText.getEndIndex());
\r
605 * Return the index of the first <b>backwards</b> match in target
\r
606 * text that ends at or before argument <tt>position</tt>.
\r
607 * This method sets the iteration to begin at the argument
\r
608 * position index of the target text and searches backwards from there.
\r
609 * @return The starting index of the first backwards match, or
\r
610 * <code>DONE</code>
\r
611 * if there are no matches.
\r
612 * @see #getMatchStart
\r
613 * @see #getMatchLength
\r
614 * @see #getMatchedText
\r
623 public final int preceding(int position)
\r
625 m_isForwardSearching_ = false;
\r
626 // position checked in usearch_setOffset
\r
627 setIndex(position);
\r
628 return previous();
\r
631 // protected data member ----------------------------------------------
\r
634 * The BreakIterator to define the boundaries of a logical match.
\r
635 * This value can be a null.
\r
636 * See class documentation for more information.
\r
637 * @see #setBreakIterator(BreakIterator)
\r
638 * @see #getBreakIterator
\r
639 * @see BreakIterator
\r
642 protected BreakIterator breakIterator;
\r
645 * Target text for searching.
\r
646 * @see #setTarget(CharacterIterator)
\r
650 protected CharacterIterator targetText;
\r
652 * Length of the most current match in target text.
\r
653 * Value 0 is the default value.
\r
654 * @see #setMatchLength
\r
655 * @see #getMatchLength
\r
658 protected int matchLength;
\r
660 // protected constructor ----------------------------------------------
\r
663 * Protected constructor for use by subclasses.
\r
664 * Initializes the iterator with the argument target text for searching
\r
665 * and sets the BreakIterator.
\r
666 * See class documentation for more details on the use of the target text
\r
667 * and BreakIterator.
\r
668 * @param target The target text to be searched.
\r
669 * @param breaker A {@link BreakIterator} that is used to determine the
\r
670 * boundaries of a logical match. This argument can be null.
\r
671 * @exception IllegalArgumentException thrown when argument target is null,
\r
673 * @see BreakIterator
\r
676 protected SearchIterator(CharacterIterator target, BreakIterator breaker)
\r
678 if (target == null
\r
679 || (target.getEndIndex() - target.getBeginIndex()) == 0) {
\r
680 throw new IllegalArgumentException(
\r
681 "Illegal argument target. " +
\r
682 " Argument can not be null or of length 0");
\r
684 targetText = target;
\r
685 breakIterator = breaker;
\r
686 if (breakIterator != null) {
\r
687 breakIterator.setText(target);
\r
690 m_lastMatchStart_ = DONE;
\r
691 m_isOverlap_ = false;
\r
692 m_isForwardSearching_ = true;
\r
694 m_setOffset_ = DONE;
\r
697 // protected methods --------------------------------------------------
\r
701 * Sets the length of the most recent match in the target text.
\r
702 * Subclasses' handleNext() and handlePrevious() methods should call this
\r
703 * after they find a match in the target text.
\r
704 * @param length new length to set
\r
706 * @see #handlePrevious
\r
709 protected void setMatchLength(int length)
\r
711 matchLength = length;
\r
716 * Abstract method that subclasses override to provide the mechanism
\r
717 * for finding the next <b>forwards</b> match in the target text. This
\r
718 * allows different subclasses to provide different search algorithms.
\r
721 * If a match is found, this function must call setMatchLength(int) to
\r
722 * set the length of the result match.
\r
723 * The iterator is adjusted so that its current index, as returned by
\r
724 * {@link #getIndex}, is the starting position of the match if one was
\r
725 * found. If a match is not found, <tt>DONE</tt> will be returned.
\r
727 * @param start index in the target text at which the forwards search
\r
729 * @return the starting index of the next forwards match if found, DONE
\r
731 * @see #setMatchLength(int)
\r
732 * @see #handlePrevious(int)
\r
736 protected abstract int handleNext(int start);
\r
740 * Abstract method which subclasses override to provide the mechanism
\r
741 * for finding the next <b>backwards</b> match in the target text.
\r
742 * This allows different
\r
743 * subclasses to provide different search algorithms.
\r
746 * If a match is found, this function must call setMatchLength(int) to
\r
747 * set the length of the result match.
\r
748 * The iterator is adjusted so that its current index, as returned by
\r
749 * {@link #getIndex}, is the starting position of the match if one was
\r
750 * found. If a match is not found, <tt>DONE</tt> will be returned.
\r
752 * @param startAt index in the target text at which the backwards search
\r
754 * @return the starting index of the next backwards match if found,
\r
756 * @see #setMatchLength(int)
\r
757 * @see #handleNext(int)
\r
761 protected abstract int handlePrevious(int startAt);
\r
763 // private data members ------------------------------------------------
\r
766 * Flag indicates if we are doing a forwards search
\r
768 private boolean m_isForwardSearching_;
\r
770 * Flag to indicate if overlapping search is to be done.
\r
771 * E.g. looking for "aa" in "aaa" will yield matches at offset 0 and 1.
\r
773 private boolean m_isOverlap_;
\r
775 * Flag indicates if we are at the start of a string search.
\r
776 * This indicates that we are in forward search and at the start of m_text.
\r
778 private boolean m_reset_;
\r
780 * Data member to store user defined position in setIndex().
\r
781 * If setIndex() is not called, this value will be DONE.
\r
783 private int m_setOffset_;
\r
785 * Offset of the beginning of the last match
\r
787 private int m_lastMatchStart_;
\r