2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 package com.ibm.icu.text;
10 import java.text.CharacterIterator;
13 * <p>SearchIterator is an abstract base class that defines a protocol
14 * for text searching. Subclasses provide concrete implementations of
15 * various search algorithms. A concrete subclass, StringSearch, is
16 * provided that implements language-sensitive pattern matching based
17 * on the comparison rules defined in a RuleBasedCollator
18 * object. Instances of SearchIterator maintain a current position and
19 * scan over the target text, returning the indices where a match is
20 * found and the length of each match. Generally, the sequence of forward
21 * matches will be equivalent to the sequence of backward matches. One
22 * case where this statement may not hold is when non-overlapping mode
23 * is set on and there are continuous repetitive patterns in the text.
24 * Consider the case searching for pattern "aba" in the text
25 * "ababababa", setting overlapping mode off will produce forward matches
26 * at offsets 0, 4. However when a backwards search is done, the
27 * results will be at offsets 6 and 2.</p>
29 * <p>If matches searched for have boundary restrictions, BreakIterators
30 * can be used to define the valid boundaries of such a match. Once a
31 * BreakIterator is set, potential matches will be tested against the
32 * BreakIterator to determine if the boundaries are valid and that all
33 * characters in the potential match are equivalent to the pattern
34 * searched for. For example, looking for the pattern "fox" in the text
35 * "foxy fox" will produce match results at offset 0 and 5 with length 3
36 * if no BreakIterators were set. However if a WordBreakIterator is set,
37 * the only match that would be found will be at the offset 5. Since
38 * the SearchIterator guarantees that if a BreakIterator is set, all its
39 * matches will match the given pattern exactly, a potential match that
40 * passes the BreakIterator might still not produce a valid match. For
41 * instance the pattern "e" will not be found in the string
42 * "\u00e9" (latin small letter e with acute) if a
43 * CharacterBreakIterator is used. Even though "e" is
44 * a part of the character "\u00e9" and the potential match at
45 * offset 0 length 1 passes the CharacterBreakIterator test, "\u00e9"
46 * is not equivalent to "e", hence the SearchIterator rejects the potential
47 * match. By default, the SearchIterator
48 * does not impose any boundary restriction on the matches; it will
49 * return all results that match the pattern. Illustrating with the
50 * above example, "e" will
51 * be found in the string "\u00e9" if no BreakIterator is
54 * <p>SearchIterator also provides a means to handle overlapping
55 * matches via the API setOverlapping(boolean). For example, if
56 * overlapping mode is set, searching for the pattern "abab" in the
57 * text "ababab" will match at positions 0 and 2, whereas if
58 * overlapping is not set, SearchIterator will only match at position
59 * 0. By default, overlapping mode is not set.</p>
61 * <p>The APIs in SearchIterator are similar to those of other text
62 * iteration classes such as BreakIterator. Using this class, it is
63 * easy to scan through text looking for all occurrences of a
68 * String target = "The quick brown fox jumped over the lazy fox";
69 * String pattern = "fox";
70 * SearchIterator iter = new StringSearch(pattern, target);
71 * for (int pos = iter.first(); pos != SearchIterator.DONE;
72 * pos = iter.next()) {
73 * // println matches at offset 16 and 41 with length 3
74 * System.out.println("Found match at " + pos + ", length is "
75 * + iter.getMatchLength());
77 * target = "ababababa";
79 * iter.setTarget(new StringCharacterIterator(target));
80 * iter.setOverlapping(false);
81 * System.out.println("Overlapping mode set to false");
82 * System.out.println("Forward matches of pattern " + pattern + " in text "
84 * for (int pos = iter.first(); pos != SearchIterator.DONE;
85 * pos = iter.next()) {
86 * // println matches at offset 0 and 4 with length 3
87 * System.out.println("offset " + pos + ", length "
88 * + iter.getMatchLength());
90 * System.out.println("Backward matches of pattern " + pattern + " in text "
92 * for (int pos = iter.last(); pos != SearchIterator.DONE;
93 * pos = iter.previous()) {
94 * // println matches at offset 6 and 2 with length 3
95 * System.out.println("offset " + pos + ", length "
96 * + iter.getMatchLength());
98 * System.out.println("Overlapping mode set to true");
99 * System.out.println("Index set to 2");
101 * iter.setOverlapping(true);
102 * System.out.println("Forward matches of pattern " + pattern + " in text "
104 * for (int pos = iter.first(); pos != SearchIterator.DONE;
105 * pos = iter.next()) {
106 * // println matches at offset 2, 4 and 6 with length 3
107 * System.out.println("offset " + pos + ", length "
108 * + iter.getMatchLength());
110 * System.out.println("Index set to 2");
112 * System.out.println("Backward matches of pattern " + pattern + " in text "
114 * for (int pos = iter.last(); pos != SearchIterator.DONE;
115 * pos = iter.previous()) {
116 * // println matches at offset 0 with length 3
117 * System.out.println("offset " + pos + ", length "
118 * + iter.getMatchLength());
122 * @author Laura Werner, synwee
126 public abstract class SearchIterator
129 // public data members -------------------------------------------------
132 * DONE is returned by previous() and next() after all valid matches have
133 * been returned, and by first() and last() if there are no matches at all.
138 public static final int DONE = -1;
140 // public methods -----------------------------------------------------
142 // public setters -----------------------------------------------------
146 * Sets the position in the target text at which the next search will start.
147 * This method clears any previous match.
149 * @param position position from which to start the next search
150 * @exception IndexOutOfBoundsException thrown if argument position is out
151 * of the target text range.
155 public void setIndex(int position) {
// Reject positions outside [getBeginIndex(), getEndIndex()]. Both bounds are
// inclusive, so the end index itself is a legal start (last() passes it in).
156 if (position < targetText.getBeginIndex()
157 || position > targetText.getEndIndex()) {
158 throw new IndexOutOfBoundsException(
159 "setIndex(int) expected position to be between " +
160 targetText.getBeginIndex() + " and " + targetText.getEndIndex());
// Remember the requested start. previous() (and the forward-search code) use
// this value instead of the iterator's current index whenever it is not DONE.
162 m_setOffset_ = position;
169 * Determines whether overlapping matches are returned. See the class
170 * documentation for more information about overlapping matches.
173 * The default setting of this property is false
175 * @param allowOverlap flag indicator if overlapping matches are allowed
176 * @see #isOverlapping
179 public void setOverlapping(boolean allowOverlap)
// Plain store of the flag; the constructor initialises it to false.
181 m_isOverlap_ = allowOverlap;
185 * Set the BreakIterator that is used to restrict the points at which
186 * matches are detected.
187 * Using <tt>null</tt> as the parameter is legal; it means that break
188 * detection should not be attempted.
189 * See class documentation for more information.
190 * @param breakiter A BreakIterator that will be used to restrict the
191 * points at which matches are detected.
192 * @see #getBreakIterator
196 public void setBreakIterator(BreakIterator breakiter)
198 breakIterator = breakiter;
// null is accepted and simply leaves no break iterator installed; a non-null
// iterator is immediately pointed at the current target text.
199 if (breakIterator != null) {
200 breakIterator.setText(targetText);
205 * Set the target text to be searched. Text iteration will then begin at
206 * the start of the text string. This method is useful if you want to
207 * reuse an iterator to search within a different body of text.
208 * @param text new text iterator to look for match,
209 * @exception IllegalArgumentException thrown when text is null or has
214 public void setTarget(CharacterIterator text)
// NOTE(review): "empty" is detected by comparing getEndIndex() with getIndex()
// (the iterator's current position), whereas the constructor compares
// getEndIndex() with getBeginIndex(). A non-empty iterator that happens to be
// positioned at its end would be rejected here -- confirm whether
// getBeginIndex() was intended.
216 if (text == null || text.getEndIndex() == text.getIndex()) {
217 throw new IllegalArgumentException("Illegal null or empty text");
// Rewind the target text to its first index and default to forward searching.
221 targetText.setIndex(targetText.getBeginIndex());
224 m_isForwardSearching_ = true;
// Keep an installed break iterator bound to the same text that is searched.
225 if (breakIterator != null) {
226 breakIterator.setText(targetText);
230 // public getters ----------------------------------------------------
234 * Returns the index of the most recent match in the target text.
235 * This call returns a valid result only after a successful call to
236 * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
237 * Just after construction, or after a searching method returns
238 * <tt>DONE</tt>, this method will return <tt>DONE</tt>.
241 * Use <tt>getMatchLength</tt> to get the length of the matched text.
242 * <tt>getMatchedText</tt> will return the subtext in the searched
243 * target text from index getMatchStart() with length getMatchLength().
245 * @return index to a substring within the text string that is being
247 * @see #getMatchLength
248 * @see #getMatchedText
256 public int getMatchStart()
// The field is DONE when there is no current match: the constructor sets it to
// DONE, and previous() resets it to DONE when it reaches the start of the text.
258 return m_lastMatchStart_;
262 * Return the index in the target text at which the iterator is currently
264 * If the iteration has gone past the end of the target text, or past
265 * the beginning for a backwards search, {@link #DONE} is returned.
266 * @return index in the target text at which the iterator is currently
275 public abstract int getIndex();
279 * Returns the length of the most recent match in the target text.
280 * This call returns a valid result only after a successful
281 * call to {@link #first}, {@link #next}, {@link #previous}, or
283 * Just after construction, or after a searching method returns
284 * <tt>DONE</tt>, this method will return 0. See getMatchStart() for
287 * @return The length of the most recent match in the target text, or 0 if
289 * @see #getMatchStart
290 * @see #getMatchedText
298 public int getMatchLength()
304 * Returns the BreakIterator that is used to restrict the indexes at which
305 * matches are detected. This will be the same object that was passed to
306 * the constructor or to <code>setBreakIterator</code>.
307 * If the BreakIterator has not been set, <tt>null</tt> will be returned.
308 * See setBreakIterator for more information.
309 * @return the BreakIterator set to restrict logic matches
310 * @see #setBreakIterator
314 public BreakIterator getBreakIterator()
// May be null: the constructor and setBreakIterator() both tolerate a null
// iterator and only bind it to the text when it is non-null.
316 return breakIterator;
320 * Return the target text that is being searched.
321 * @return target text being searched.
325 public CharacterIterator getTarget()
331 * Returns the text that was matched by the most recent call to
332 * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
333 * If the iterator is not pointing at a valid match, for instance just
334 * after construction or after <tt>DONE</tt> has been returned, an empty
335 * String will be returned. See getMatchStart for more information
336 * @see #getMatchStart
337 * @see #getMatchLength
343 * @return the substring in the target text of the most recent match
346 public String getMatchedText()
// Only a positive match length denotes a current match to copy out.
348 if (matchLength > 0) {
// limit is the exclusive end index of the match within the target text.
349 int limit = m_lastMatchStart_ + matchLength;
// Pre-size the builder to the match length.
350 StringBuilder result = new StringBuilder(matchLength);
351 result.append(targetText.current());
// Copy characters while the target iterator is still before limit.
// NOTE(review): the statement that advances targetText is not visible in this
// excerpt -- confirm the iterator is stepped between appends.
353 while (targetText.getIndex() < limit) {
354 result.append(targetText.current());
// Leave the target iterator positioned at the start of the match.
357 targetText.setIndex(m_lastMatchStart_);
358 return result.toString();
363 // miscellaneous public methods -----------------------------------------
366 * Search <b>forwards</b> in the target text for the next valid match,
367 * starting the search from the current iterator position. The iterator is
368 * adjusted so that its current index, as returned by {@link #getIndex},
369 * is the starting position of the match if one was found. If a match is
370 * found, the index of the match is returned, otherwise <tt>DONE</tt> is
371 * returned. If overlapping mode is set, the beginning of the found match
372 * can be before the end of the current match, if any.
373 * @return The starting index of the next forward match after the current
374 * iterator position, or
375 * <tt>DONE</tt> if there are no more matches.
376 * @see #getMatchStart
377 * @see #getMatchLength
378 * @see #getMatchedText
389 int start = targetText.getIndex();
390 if (m_setOffset_ != DONE) {
391 start = m_setOffset_;
394 if (m_isForwardSearching_) {
396 start + matchLength >= targetText.getEndIndex()) {
397 // not enough characters to match
399 targetText.setIndex(targetText.getEndIndex());
400 m_lastMatchStart_ = DONE;
406 // switching direction.
407 // if matchedIndex == USEARCH_DONE, it means that either a
408 // setIndex has been called or that previous ran off the text
409 // string. the iterator would have been set to offset 0 if a
410 // match is not found.
411 m_isForwardSearching_ = true;
413 // there's no need to set the collation element iterator
414 // the next call to next will set the offset.
420 start = targetText.getBeginIndex();
422 if (matchLength > 0) {
423 // if match length is 0 we are at the start of the iteration
428 start += matchLength;
431 m_lastMatchStart_ = handleNext(start);
432 return m_lastMatchStart_;
436 * Search <b>backwards</b> in the target text for the next valid match,
437 * starting the search from the current iterator position. The iterator is
438 * adjusted so that its current index, as returned by {@link #getIndex},
439 * is the starting position of the match if one was found. If a match is
440 * found, the index is returned, otherwise <tt>DONE</tt> is returned. If
441 * overlapping mode is set, the end of the found match can be after the
442 * beginning of the previous match, if any.
443 * @return The starting index of the next backwards match after the current
444 * iterator position, or
445 * <tt>DONE</tt> if there are no more matches.
446 * @see #getMatchStart
447 * @see #getMatchLength
448 * @see #getMatchedText
457 public int previous()
// Begin at the target iterator's current index unless setIndex() stored an
// explicit offset (m_setOffset_ != DONE), which then takes precedence.
459 int start = targetText.getIndex();
460 if (m_setOffset_ != DONE) {
461 start = m_setOffset_;
465 m_isForwardSearching_ = false;
467 start = targetText.getEndIndex();
470 if (m_isForwardSearching_ == true) {
471 // switching direction.
472 // if matchedIndex == USEARCH_DONE, it means that either a
473 // setIndex has been called or that next ran off the text
474 // string. the iterator would have been set to offset textLength if
475 // a match is not found.
// NOTE(review): matchedIndex / USEARCH_DONE are names from the C
// implementation; presumably they correspond to m_lastMatchStart_ / DONE
// in this class -- confirm.
476 m_isForwardSearching_ = false;
477 if (start != targetText.getEndIndex()) {
// At the very beginning of the text nothing can precede the start position:
// park the iterator at the begin index and clear the current match.
482 if (start == targetText.getBeginIndex()) {
483 // not enough characters to match
485 targetText.setIndex(targetText.getBeginIndex());
486 m_lastMatchStart_ = DONE;
// Delegate the actual backwards search to the subclass; its result becomes
// the current match start and is returned to the caller.
491 m_lastMatchStart_ = handlePrevious(start);
492 return m_lastMatchStart_;
496 * Return true if the overlapping property has been set.
497 * See setOverlapping(boolean) for more information.
498 * @see #setOverlapping
499 * @return true if the overlapping property has been set, false otherwise
502 public boolean isOverlapping()
509 * Resets the search iteration. All properties will be reset to their
513 * If a forward iteration is initiated, the next search will begin at the
514 * start of the target text. Otherwise, if a backwards iteration is initiated,
515 * the next search will begin at the end of the target text.
521 // reset is setting the attributes that are already in string search
523 setIndex(targetText.getBeginIndex());
524 m_isOverlap_ = false;
525 m_isForwardSearching_ = true;
531 * Return the index of the first <b>forward</b> match in the target text.
532 * This method sets the iteration to begin at the start of the
533 * target text and searches forward from there.
534 * @return The index of the first forward match, or <code>DONE</code>
535 * if there are no matches.
536 * @see #getMatchStart
537 * @see #getMatchLength
538 * @see #getMatchedText
547 public final int first()
// Force forward direction and restart from the first index of the target text.
549 m_isForwardSearching_ = true;
550 setIndex(targetText.getBeginIndex());
555 * Return the index of the first <b>forward</b> match in target text that
556 * is at or after argument <tt>position</tt>.
557 * This method sets the iteration to begin at the specified
558 * position in the target text and searches forward from there.
559 * @return The index of the first forward match, or <code>DONE</code>
560 * if there are no matches.
561 * @see #getMatchStart
562 * @see #getMatchLength
563 * @see #getMatchedText
572 public final int following(int position)
574 m_isForwardSearching_ = true;
575 // position checked in usearch_setOffset
// NOTE(review): usearch_setOffset is a C-implementation name. In this class the
// range check on a position lives in setIndex(int); presumably that is what is
// invoked next -- the call itself is not visible in this excerpt, confirm.
581 * Return the index of the first <b>backward</b> match in target text.
582 * This method sets the iteration to begin at the end of the
583 * target text and searches backwards from there.
584 * @return The starting index of the first backward match, or
585 * <code>DONE</code> if there are no matches.
586 * @see #getMatchStart
587 * @see #getMatchLength
588 * @see #getMatchedText
597 public final int last()
// Force backward direction and restart from the end of the target text.
// setIndex() accepts getEndIndex() because its upper bound is inclusive.
599 m_isForwardSearching_ = false;
600 setIndex(targetText.getEndIndex());
605 * Return the index of the first <b>backwards</b> match in target
606 * text that ends at or before argument <tt>position</tt>.
607 * This method sets the iteration to begin at the argument
608 * position index of the target text and searches backwards from there.
609 * @return The starting index of the first backwards match, or
611 * if there are no matches.
612 * @see #getMatchStart
613 * @see #getMatchLength
614 * @see #getMatchedText
623 public final int preceding(int position)
625 m_isForwardSearching_ = false;
626 // position checked in usearch_setOffset
// NOTE(review): usearch_setOffset is a C-implementation name. In this class the
// range check on a position lives in setIndex(int); presumably that is what is
// invoked next -- the call itself is not visible in this excerpt, confirm.
631 // protected data member ----------------------------------------------
634 * The BreakIterator to define the boundaries of a logical match.
635 * This value can be a null.
636 * See class documentation for more information.
637 * @see #setBreakIterator(BreakIterator)
638 * @see #getBreakIterator
642 protected BreakIterator breakIterator;
645 * Target text for searching.
646 * @see #setTarget(CharacterIterator)
650 protected CharacterIterator targetText;
652 * Length of the most current match in target text.
653 * Value 0 is the default value.
654 * @see #setMatchLength
655 * @see #getMatchLength
658 protected int matchLength;
660 // protected constructor ----------------------------------------------
663 * Protected constructor for use by subclasses.
664 * Initializes the iterator with the argument target text for searching
665 * and sets the BreakIterator.
666 * See class documentation for more details on the use of the target text
668 * @param target The target text to be searched.
669 * @param breaker A {@link BreakIterator} that is used to determine the
670 * boundaries of a logical match. This argument can be null.
671 * @exception IllegalArgumentException thrown when argument target is null,
676 protected SearchIterator(CharacterIterator target, BreakIterator breaker)
// A target whose begin and end index coincide is empty and is rejected.
679 || (target.getEndIndex() - target.getBeginIndex()) == 0) {
680 throw new IllegalArgumentException(
681 "Illegal argument target. " +
682 " Argument can not be null or of length 0");
685 breakIterator = breaker;
// A null breaker is permitted; only a non-null one is bound to the target text.
686 if (breakIterator != null) {
687 breakIterator.setText(target);
// Initial state: no current match, non-overlapping matches, forward direction.
690 m_lastMatchStart_ = DONE;
691 m_isOverlap_ = false;
692 m_isForwardSearching_ = true;
697 // protected methods --------------------------------------------------
701 * Sets the length of the most recent match in the target text.
702 * Subclasses' handleNext() and handlePrevious() methods should call this
703 * after they find a match in the target text.
704 * @param length new length to set
706 * @see #handlePrevious
709 protected void setMatchLength(int length)
// Plain store. getMatchedText() reads this field both to decide whether a match
// exists (> 0) and to compute the end of the text it copies.
711 matchLength = length;
716 * Abstract method that subclasses override to provide the mechanism
717 * for finding the next <b>forwards</b> match in the target text. This
718 * allows different subclasses to provide different search algorithms.
721 * If a match is found, this function must call setMatchLength(int) to
722 * set the length of the result match.
723 * The iterator is adjusted so that its current index, as returned by
724 * {@link #getIndex}, is the starting position of the match if one was
725 * found. If a match is not found, <tt>DONE</tt> will be returned.
727 * @param start index in the target text at which the forwards search
729 * @return the starting index of the next forwards match if found, DONE
731 * @see #setMatchLength(int)
732 * @see #handlePrevious(int)
// The visible forward-search code in this class stores the returned index as the
// last match start and returns it unchanged to its own caller.
736 protected abstract int handleNext(int start);
740 * Abstract method which subclasses override to provide the mechanism
741 * for finding the next <b>backwards</b> match in the target text.
742 * This allows different
743 * subclasses to provide different search algorithms.
746 * If a match is found, this function must call setMatchLength(int) to
747 * set the length of the result match.
748 * The iterator is adjusted so that its current index, as returned by
749 * {@link #getIndex}, is the starting position of the match if one was
750 * found. If a match is not found, <tt>DONE</tt> will be returned.
752 * @param startAt index in the target text at which the backwards search
754 * @return the starting index of the next backwards match if found,
756 * @see #setMatchLength(int)
757 * @see #handleNext(int)
// previous() stores the returned index as the last match start and returns it
// unchanged to its own caller.
761 protected abstract int handlePrevious(int startAt);
763 // private data members ------------------------------------------------
766 * Flag indicates if we are doing a forwards search
768 private boolean m_isForwardSearching_;
770 * Flag to indicate if overlapping search is to be done.
771 * E.g. looking for "aa" in "aaa" will yield matches at offset 0 and 1.
773 private boolean m_isOverlap_;
775 * Flag indicates if we are at the start of a string search.
776 * This indicates that we are in forward search and at the start of m_text.
778 private boolean m_reset_;
780 * Data member to store user defined position in setIndex().
781 * If setIndex() is not called, this value will be DONE.
783 private int m_setOffset_;
785 * Offset of the beginning of the last match
787 private int m_lastMatchStart_;