2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.text.CharacterIterator;
\r
11 import java.text.StringCharacterIterator;
\r
12 import java.util.Locale;
\r
14 import com.ibm.icu.util.ULocale;
\r
17 * A class that locates boundaries in text. This class defines a protocol for
\r
18 * objects that break up a piece of natural-language text according to a set
\r
19 * of criteria. Instances or subclasses of BreakIterator can be provided, for
\r
20 * example, to break a piece of text into words, sentences, or logical characters
\r
21 * according to the conventions of some language or group of languages.
\r
23 * We provide five built-in types of BreakIterator:
\r
24 * <ul><li>getTitleInstance() returns a BreakIterator that locates boundaries
\r
25 * between title breaks.
\r
26 * <li>getSentenceInstance() returns a BreakIterator that locates boundaries
\r
27 * between sentences. This is useful for triple-click selection, for example.
\r
28 * <li>getWordInstance() returns a BreakIterator that locates boundaries between
\r
29 * words. This is useful for double-click selection or "find whole words" searches.
\r
30 * This type of BreakIterator makes sure there is a boundary position at the
\r
31 * beginning and end of each legal word. (Numbers count as words, too.) Whitespace
\r
32 * and punctuation are kept separate from real words.
\r
33 * <li>getLineInstance() returns a BreakIterator that locates positions where it is
\r
34 * legal for a text editor to wrap lines. This is similar to word breaking, but
\r
35 * not the same: punctuation and whitespace are generally kept with words (you don't
\r
36 * want a line to start with whitespace, for example), and some special characters
\r
37 * can force a position to be considered a line-break position or prevent a position
\r
38 * from being a line-break position.
\r
39 * <li>getCharacterInstance() returns a BreakIterator that locates boundaries between
\r
40 * logical characters. Because of the structure of the Unicode encoding, a logical
\r
41 * character may be stored internally as more than one Unicode code point. (A with an
\r
42 * umlaut may be stored as an a followed by a separate combining umlaut character,
\r
43 * for example, but the user still thinks of it as one character.) This iterator allows
\r
44 * various processes (especially text editors) to treat as characters the units of text
\r
45 * that a user would think of as characters, rather than the units of text that the
\r
46 * computer sees as "characters".</ul>
\r
48 * BreakIterator's interface follows an "iterator" model (hence the name), meaning it
\r
49 * has a concept of a "current position" and methods like first(), last(), next(),
\r
50 * and previous() that update the current position. All BreakIterators uphold the
\r
51 * following invariants:
\r
52 * <ul><li>The beginning and end of the text are always treated as boundary positions.
\r
53 * <li>The current position of the iterator is always a boundary position (random-
\r
54 * access methods move the iterator to the nearest boundary position before or
\r
55 * after the specified position, not _to_ the specified position).
\r
56 * <li>DONE is used as a flag to indicate when iteration has stopped. DONE is only
\r
57 * returned when the current position is the end of the text and the user calls next(),
\r
58 * or when the current position is the beginning of the text and the user calls previous().
\r
60 * <li>Break positions are numbered by the positions of the characters that follow
\r
61 * them. Thus, under normal circumstances, the position before the first character
\r
62 * is 0, the position after the first character is 1, and the position after the
\r
63 * last character is the length of the string.
\r
64 * <li>The client can change the position of an iterator, or the text it analyzes,
\r
65 * at will, but cannot change the behavior. If the user wants different behavior, he
\r
66 * must instantiate a new iterator.</ul>
\r
68 * BreakIterator accesses the text it analyzes through a CharacterIterator, which makes
\r
69 * it possible to use BreakIterator to analyze text in any text-storage vehicle that
\r
70 * provides a CharacterIterator interface.
\r
72 * <b>NOTE:</b> Some types of BreakIterator can take a long time to create, and
\r
73 * instances of BreakIterator are not currently cached by the system. For
\r
74 * optimal performance, keep instances of BreakIterator around as long as makes
\r
75 * sense. For example, when word-wrapping a document, don't create and destroy a
\r
76 * new BreakIterator for each line. Create one break iterator for the whole document
\r
77 * (or whatever stretch of text you're wrapping) and use it to do the whole job of
\r
78 * wrapping the text.
\r
81 * <strong>Examples</strong>:<P>
\r
82 * Creating and using text boundaries
\r
85 * public static void main(String args[]) {
\r
86 * if (args.length == 1) {
\r
87 * String stringToExamine = args[0];
\r
88 * //print each word in order
\r
89 * BreakIterator boundary = BreakIterator.getWordInstance();
\r
90 * boundary.setText(stringToExamine);
\r
91 * printEachForward(boundary, stringToExamine);
\r
92 * //print each sentence in reverse order
\r
93 * boundary = BreakIterator.getSentenceInstance(Locale.US);
\r
94 * boundary.setText(stringToExamine);
\r
95 * printEachBackward(boundary, stringToExamine);
\r
96 * printFirst(boundary, stringToExamine);
\r
97 * printLast(boundary, stringToExamine);
\r
103 * Print each element in order
\r
106 * public static void printEachForward(BreakIterator boundary, String source) {
\r
107 * int start = boundary.first();
\r
108 * for (int end = boundary.next();
\r
109 * end != BreakIterator.DONE;
\r
110 * start = end, end = boundary.next()) {
\r
111 * System.out.println(source.substring(start,end));
\r
117 * Print each element in reverse order
\r
120 * public static void printEachBackward(BreakIterator boundary, String source) {
\r
121 * int end = boundary.last();
\r
122 * for (int start = boundary.previous();
\r
123 * start != BreakIterator.DONE;
\r
124 * end = start, start = boundary.previous()) {
\r
125 * System.out.println(source.substring(start,end));
\r
131 * Print first element
\r
134 * public static void printFirst(BreakIterator boundary, String source) {
\r
135 * int start = boundary.first();
\r
136 * int end = boundary.next();
\r
137 * System.out.println(source.substring(start,end));
\r
142 * Print last element
\r
145 * public static void printLast(BreakIterator boundary, String source) {
\r
146 * int end = boundary.last();
\r
147 * int start = boundary.previous();
\r
148 * System.out.println(source.substring(start,end));
\r
153 * Print the element at a specified position
\r
156 * public static void printAt(BreakIterator boundary, int pos, String source) {
\r
157 * int end = boundary.following(pos);
\r
158 * int start = boundary.previous();
\r
159 * System.out.println(source.substring(start,end));
\r
164 * Find the next word
\r
167 * public static int nextWordStartAfter(int pos, String text) {
\r
168 * BreakIterator wb = BreakIterator.getWordInstance();
\r
169 * wb.setText(text);
\r
170 * int last = wb.following(pos);
\r
171 * int current = wb.next();
\r
172 * while (current != BreakIterator.DONE) {
\r
173 * for (int p = last; p < current; p++) {
\r
174 * if (Character.isLetter(text.charAt(p))
\r
178 * current = wb.next();
\r
180 * return BreakIterator.DONE;
\r
183 * (The iterator returned by BreakIterator.getWordInstance() is unique in that
\r
184 * the break positions it returns don't represent both the start and end of the
\r
185 * thing being iterated over. That is, a sentence-break iterator returns breaks
\r
186 * that each represent the end of one sentence and the beginning of the next.
\r
187 * With the word-break iterator, the characters between two boundaries might be a
\r
188 * word, or they might be the punctuation or whitespace between two words. The
\r
189 * above code uses a simple heuristic to determine which boundary is the beginning
\r
190 * of a word: If the characters between this boundary and the next boundary
\r
191 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
\r
192 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
\r
193 * and the next is a word; otherwise, it's the material between words.)
\r
196 * @see CharacterIterator
\r
200 public abstract class BreakIterator implements Cloneable {
\r
202 protected BreakIterator() {
\r
206 * Create a copy of this iterator
\r
207 * @return A copy of this
\r
209 public Object clone() {
\r
210 // this is here for subclass use. we must override it ourselves, though.
\r
212 return super.clone();
\r
214 catch (CloneNotSupportedException e) {
\r
215 throw new InternalError();
\r
220 * DONE is returned by previous() and next() after all valid
\r
221 * boundaries have been returned.
\r
224 public static final int DONE = -1;
\r
227 * Return the first boundary position. This is always the beginning
\r
228 * index of the text this iterator iterates over. For example, if
\r
229 * the iterator iterates over a whole string, this function will
\r
230 * always return 0. This function also updates the iteration position
\r
231 * to point to the beginning of the text.
\r
232 * @return The character offset of the beginning of the stretch of text
\r
236 public abstract int first();
\r
239 * Return the last boundary position. This is always the "past-the-end"
\r
240 * index of the text this iterator iterates over. For example, if the
\r
241 * iterator iterates over a whole string (call it "text"), this function
\r
242 * will always return text.length(). This function also updates the
\r
243 * iteration position to point to the end of the text.
\r
244 * @return The character offset of the end of the stretch of text
\r
248 public abstract int last();
\r
251 * Advances the specified number of steps forward in the text (a negative
\r
252 * number, therefore, advances backwards). If this causes the iterator
\r
253 * to advance off either end of the text, this function returns DONE;
\r
254 * otherwise, this function returns the position of the appropriate
\r
255 * boundary. Calling this function is equivalent to calling next() or
\r
256 * previous() n times.
\r
257 * @param n The number of boundaries to advance over (if positive, moves
\r
258 * forward; if negative, moves backwards).
\r
259 * @return The position of the boundary n boundaries from the current
\r
260 * iteration position, or DONE if moving n boundaries causes the iterator
\r
261 * to advance off either end of the text.
\r
264 public abstract int next(int n);
\r
267 * Advances the iterator forward one boundary. The current iteration
\r
268 * position is updated to point to the next boundary position after the
\r
269 * current position, and this is also the value that is returned. If
\r
270 * the current position is equal to the value returned by last(), or to
\r
271 * DONE, this function returns DONE and sets the current position to
\r
273 * @return The position of the first boundary position following the
\r
274 * iteration position.
\r
277 public abstract int next();
\r
280 * Advances the iterator backward one boundary. The current iteration
\r
281 * position is updated to point to the last boundary position before
\r
282 * the current position, and this is also the value that is returned. If
\r
283 * the current position is equal to the value returned by first(), or to
\r
284 * DONE, this function returns DONE and sets the current position to
\r
286 * @return The position of the last boundary position preceding the
\r
287 * iteration position.
\r
290 public abstract int previous();
\r
293 * Sets the iterator's current iteration position to be the first
\r
294 * boundary position following the specified position. (Whether the
\r
295 * specified position is itself a boundary position or not doesn't
\r
296 * matter-- this function always moves the iteration position to the
\r
297 * first boundary after the specified position.) If the specified
\r
298 * position is the past-the-end position, returns DONE.
\r
299 * @param offset The character position to start searching from.
\r
300 * @return The position of the first boundary position following
\r
301 * "offset" (whether or not "offset" itself is a boundary position),
\r
302 * or DONE if "offset" is the past-the-end offset.
\r
305 public abstract int following(int offset);
\r
308 * Sets the iterator's current iteration position to be the last
\r
309 * boundary position preceding the specified position. (Whether the
\r
310 * specified position is itself a boundary position or not doesn't
\r
311 * matter-- this function always moves the iteration position to the
\r
312 * last boundary before the specified position.) If the specified
\r
313 * position is the starting position, returns DONE.
\r
314 * @param offset The character position to start searching from.
\r
315 * @return The position of the last boundary position preceding
\r
316 * "offset" (whether or not "offset" itself is a boundary position),
\r
317 * or DONE if "offset" is the starting offset of the iterator.
\r
320 public int preceding(int offset) {
\r
321 int pos = following(offset);
\r
322 while (pos >= offset && pos != DONE) {
\r
329 * Return true if the specified position is a boundary position. If the
\r
330 * function returns true, the current iteration position is set to the
\r
331 * specified position; if the function returns false, the current
\r
332 * iteration position is set as though following() had been called.
\r
333 * @param offset the offset to check.
\r
334 * @return True if "offset" is a boundary position.
\r
337 public boolean isBoundary(int offset) {
\r
338 return offset == 0 || following(offset - 1) == offset;
\r
342 * Return the iterator's current position.
\r
343 * @return The iterator's current position.
\r
346 public abstract int current();
\r
349 * Returns a CharacterIterator over the text being analyzed.
\r
350 * For at least some subclasses of BreakIterator, this is a reference
\r
351 * to the <b>actual iterator being used</b> by the BreakIterator,
\r
352 * and therefore, this function's return value should be treated as
\r
353 * <tt>const</tt>. No guarantees are made about the current position
\r
354 * of this iterator when it is returned. If you need to move that
\r
355 * position to examine the text, clone this function's return value first.
\r
356 * @return A CharacterIterator over the text being analyzed.
\r
359 public abstract CharacterIterator getText();
\r
362 * Sets the iterator to analyze a new piece of text. The new
\r
363 * piece of text is passed in as a String, and the current
\r
364 * iteration position is reset to the beginning of the string.
\r
365 * (The old text is dropped.)
\r
366 * @param newText A String containing the text to analyze with
\r
367 * this BreakIterator.
\r
370 public void setText(String newText) {
\r
371 setText(new StringCharacterIterator(newText));
\r
375 * Sets the iterator to analyze a new piece of text. The
\r
376 * BreakIterator is passed a CharacterIterator through which
\r
377 * it will access the text itself. The current iteration
\r
378 * position is reset to the CharacterIterator's start index.
\r
379 * (The old iterator is dropped.)
\r
380 * @param newText A CharacterIterator referring to the text
\r
381 * to analyze with this BreakIterator (the iterator's current
\r
382 * position is ignored, but its other state is significant).
\r
385 public abstract void setText(CharacterIterator newText);
\r
387 /** @stable ICU 2.4 */
\r
388 public static final int KIND_CHARACTER = 0;
\r
389 /** @stable ICU 2.4 */
\r
390 public static final int KIND_WORD = 1;
\r
391 /** @stable ICU 2.4 */
\r
392 public static final int KIND_LINE = 2;
\r
393 /** @stable ICU 2.4 */
\r
394 public static final int KIND_SENTENCE = 3;
\r
395 /** @stable ICU 2.4 */
\r
396 public static final int KIND_TITLE = 4;
\r
399 * Returns a new instance of BreakIterator that locates word boundaries.
\r
400 * This function assumes that the text being analyzed is in the default
\r
401 * locale's language.
\r
402 * @return An instance of BreakIterator that locates word boundaries.
\r
405 public static BreakIterator getWordInstance() {
\r
406 return getWordInstance(Locale.getDefault());
\r
410 * Returns a new instance of BreakIterator that locates word boundaries.
\r
411 * @param where A locale specifying the language of the text to be
\r
413 * @return An instance of BreakIterator that locates word boundaries.
\r
416 public static BreakIterator getWordInstance(Locale where) {
\r
417 return getBreakInstance(where, KIND_WORD);
\r
421 * Returns a new instance of BreakIterator that locates word boundaries.
\r
422 * @param where A locale specifying the language of the text to be
\r
424 * @return An instance of BreakIterator that locates word boundaries.
\r
425 * @stable ICU 3.4.3
\r
427 public static BreakIterator getWordInstance(ULocale where) {
\r
428 return getBreakInstance(where.toLocale(), KIND_WORD);
\r
432 * Returns a new instance of BreakIterator that locates legal line-
\r
433 * wrapping positions. This function assumes the text being broken
\r
434 * is in the default locale's language.
\r
435 * @return A new instance of BreakIterator that locates legal
\r
436 * line-wrapping positions.
\r
439 public static BreakIterator getLineInstance() {
\r
440 return getLineInstance(Locale.getDefault());
\r
444 * Returns a new instance of BreakIterator that locates legal line-
\r
445 * wrapping positions.
\r
446 * @param where A Locale specifying the language of the text being broken.
\r
447 * @return A new instance of BreakIterator that locates legal
\r
448 * line-wrapping positions.
\r
451 public static BreakIterator getLineInstance(Locale where) {
\r
452 return getBreakInstance(where, KIND_LINE);
\r
456 * Returns a new instance of BreakIterator that locates legal line-
\r
457 * wrapping positions.
\r
458 * @param where A Locale specifying the language of the text being broken.
\r
459 * @return A new instance of BreakIterator that locates legal
\r
460 * line-wrapping positions.
\r
461 * @stable ICU 3.4.3
\r
463 public static BreakIterator getLineInstance(ULocale where) {
\r
464 return getBreakInstance(where.toLocale(), KIND_LINE);
\r
468 * Returns a new instance of BreakIterator that locates logical-character
\r
469 * boundaries. This function assumes that the text being analyzed is
\r
470 * in the default locale's language.
\r
471 * @return A new instance of BreakIterator that locates logical-character
\r
475 public static BreakIterator getCharacterInstance() {
\r
476 return getCharacterInstance(Locale.getDefault());
\r
480 * Returns a new instance of BreakIterator that locates logical-character
\r
482 * @param where A Locale specifying the language of the text being analyzed.
\r
483 * @return A new instance of BreakIterator that locates logical-character
\r
487 public static BreakIterator getCharacterInstance(Locale where) {
\r
488 return getBreakInstance(where, KIND_CHARACTER);
\r
492 * Returns a new instance of BreakIterator that locates logical-character
\r
494 * @param where A Locale specifying the language of the text being analyzed.
\r
495 * @return A new instance of BreakIterator that locates logical-character
\r
499 public static BreakIterator getCharacterInstance(ULocale where) {
\r
500 return getBreakInstance(where.toLocale(), KIND_CHARACTER);
\r
504 * Returns a new instance of BreakIterator that locates sentence boundaries.
\r
505 * This function assumes the text being analyzed is in the default locale's
\r
507 * @return A new instance of BreakIterator that locates sentence boundaries.
\r
510 public static BreakIterator getSentenceInstance() {
\r
511 return getSentenceInstance(Locale.getDefault());
\r
515 * Returns a new instance of BreakIterator that locates sentence boundaries.
\r
516 * @param where A Locale specifying the language of the text being analyzed.
\r
517 * @return A new instance of BreakIterator that locates sentence boundaries.
\r
520 public static BreakIterator getSentenceInstance(Locale where) {
\r
521 return getBreakInstance(where, KIND_SENTENCE);
\r
525 * Returns a new instance of BreakIterator that locates sentence boundaries.
\r
526 * @param where A Locale specifying the language of the text being analyzed.
\r
527 * @return A new instance of BreakIterator that locates sentence boundaries.
\r
528 * @stable ICU 3.4.3
\r
530 public static BreakIterator getSentenceInstance(ULocale where) {
\r
531 return getBreakInstance(where.toLocale(), KIND_SENTENCE);
\r
534 private static BreakIterator getBreakInstance(Locale where, int kind) {
\r
535 java.text.BreakIterator br = null;
\r
537 case KIND_CHARACTER: br = java.text.BreakIterator.getCharacterInstance(where); break;
\r
538 case KIND_WORD: br = java.text.BreakIterator.getWordInstance(where); break;
\r
539 case KIND_LINE: br = java.text.BreakIterator.getLineInstance(where); break;
\r
540 case KIND_SENTENCE: br = java.text.BreakIterator.getSentenceInstance(where); break;
\r
541 case KIND_TITLE: throw new UnsupportedOperationException();
\r
543 return new BreakIteratorHandle(br);
\r
547 * Returns a list of locales for which BreakIterators can be used.
\r
548 * @return An array of Locales. All of the locales in the array can
\r
549 * be used when creating a BreakIterator.
\r
550 * @stable ICU 3.4.3
\r
552 public static synchronized Locale[] getAvailableLocales() {
\r
553 return java.text.BreakIterator.getAvailableLocales();
\r
557 * Returns a list of locales for which BreakIterators can be used.
\r
558 * @return An array of ULocales. All of the locales in the array can
\r
559 * be used when creating a BreakIterator.
\r
560 * @stable ICU 3.4.3
\r
562 public static synchronized ULocale[] getAvailableULocales() {
\r
563 Locale[] locales = java.text.BreakIterator.getAvailableLocales();
\r
564 ULocale[] ulocales = new ULocale[locales.length];
\r
565 for (int i = 0; i < locales.length; ++i) {
\r
566 ulocales[i] = ULocale.forLocale(locales[i]);
\r
571 // forwarding implementation class
\r
572 static final class BreakIteratorHandle extends BreakIterator {
\r
576 public final java.text.BreakIterator breakIterator;
\r
580 * @param delegate the BreakIterator to which to delegate
\r
582 public BreakIteratorHandle(java.text.BreakIterator delegate) {
\r
583 this.breakIterator = delegate;
\r
586 public int first() {
\r
587 return breakIterator.first();
\r
589 public int last() {
\r
590 return breakIterator.last();
\r
592 public int next(int n) {
\r
593 return breakIterator.next(n);
\r
595 public int next() {
\r
596 return breakIterator.next();
\r
598 public int previous() {
\r
599 return breakIterator.previous();
\r
601 public int following(int offset) {
\r
602 return breakIterator.following(offset);
\r
604 public int preceding(int offset) {
\r
605 return breakIterator.preceding(offset);
\r
607 public boolean isBoundary(int offset) {
\r
608 return breakIterator.isBoundary(offset);
\r
610 public int current() {
\r
611 return breakIterator.current();
\r
613 public CharacterIterator getText() {
\r
614 return breakIterator.getText();
\r
616 public void setText(CharacterIterator newText) {
\r
617 breakIterator.setText(newText);
\r
621 * Return a string suitable for debugging.
\r
622 * @return a string suitable for debugging
\r
623 * @stable ICU 3.4.3
\r
625 public String toString() {
\r
626 return breakIterator.toString();
\r
630 * Return a clone of this BreakIterator.
\r
631 * @return a clone of this BreakIterator
\r
632 * @stable ICU 3.4.3
\r
634 public Object clone() {
\r
635 return new BreakIteratorHandle((java.text.BreakIterator)breakIterator.clone());
\r
639 * Return true if rhs is a BreakIterator with the same break behavior as this.
\r
640 * @return true if rhs equals this
\r
641 * @stable ICU 3.4.3
\r
643 public boolean equals(Object rhs) {
\r
645 return breakIterator.equals(((BreakIteratorHandle)rhs).breakIterator);
\r
647 catch (Exception e) {
\r
653 * Return a hashCode.
\r
654 * @return a hashCode
\r
655 * @stable ICU 3.4.3
\r
657 public int hashCode() {
\r
658 return breakIterator.hashCode();
\r