2 *******************************************************************************
3 * Copyright (C) 2005-2013 International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import static com.ibm.icu.impl.CharacterIteration.DONE32;
10 import static com.ibm.icu.impl.CharacterIteration.current32;
11 import static com.ibm.icu.impl.CharacterIteration.next32;
12 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
13 import static com.ibm.icu.impl.CharacterIteration.previous32;
15 import java.io.ByteArrayInputStream;
16 import java.io.ByteArrayOutputStream;
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.OutputStream;
20 import java.text.CharacterIterator;
21 import java.util.Collections;
22 import java.util.HashSet;
24 import java.util.Stack;
26 import com.ibm.icu.impl.Assert;
27 import com.ibm.icu.impl.CharTrie;
28 import com.ibm.icu.impl.ICUDebug;
29 import com.ibm.icu.lang.UCharacter;
30 import com.ibm.icu.lang.UProperty;
31 import com.ibm.icu.lang.UScript;
34 * Rule Based Break Iterator
35 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
39 public class RuleBasedBreakIterator extends BreakIterator {
40 //=======================================================================
41 // Constructors & Factories
42 //=======================================================================
46 * @deprecated This API is ICU internal only.
48 private RuleBasedBreakIterator() {
49 fLastStatusIndexValid = true;
50 fDictionaryCharCount = 0;
51 fBreakEngines.add(fUnhandledBreakEngine);
55 * Create a break iterator from a precompiled set of break rules.
57 * Creating a break iterator from the binary rules is much faster than
58 * creating one from source rules.
60 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
61 * Binary break iterator rules are not guaranteed to be compatible between
62 * different versions of ICU.
64 * @param is an input stream supplying the compiled binary rules.
65 * @throws IOException if there is an error while reading the rules from the InputStream.
66 * @see #compileRules(String, OutputStream)
69 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
70 RuleBasedBreakIterator This = new RuleBasedBreakIterator();
71 This.fRData = RBBIDataWrapper.get(is);
76 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
77 * @param rules The break rules to be used.
80 public RuleBasedBreakIterator(String rules) {
83 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
84 compileRules(rules, ruleOS);
85 byte [] ruleBA = ruleOS.toByteArray();
86 InputStream ruleIS = new ByteArrayInputStream(ruleBA);
87 fRData = RBBIDataWrapper.get(ruleIS);
88 } catch (IOException e) {
90 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
91 // causing bogus compiled rules to be produced, but with no compile error raised.
92 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
99 //=======================================================================
101 //=======================================================================
104 * Clones this iterator.
105 * @return A newly-constructed RuleBasedBreakIterator with the same
106 * behavior as this one.
109 public Object clone()
111 RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
113 result.fText = (CharacterIterator)(fText.clone());
119 * Returns true if both BreakIterators are of the same class, have the same
120 * rules, and iterate over the same text.
123 public boolean equals(Object that) {
131 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
132 if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
135 if (fRData != null && other.fRData != null &&
136 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
139 if (fText == null && other.fText == null) {
142 if (fText == null || other.fText == null) {
145 return fText.equals(other.fText);
147 catch(ClassCastException e) {
153 * Returns the description (rules) used to create this iterator.
154 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
157 public String toString() {
159 if (fRData != null) {
160 retStr = fRData.fRuleSource;
166 * Compute a hashcode for this BreakIterator
167 * @return A hash code
170 public int hashCode()
172 return fRData.fRuleSource.hashCode();
176 * Tag value for "words" that do not fit into any of the other categories.
177 * Includes spaces and most punctuation.
179 * @provisional This is a draft API and might change in a future release of ICU.
181 public static final int WORD_NONE = 0;
184 * Upper bound for tags for uncategorized words.
186 * @provisional This is a draft API and might change in a future release of ICU.
188 public static final int WORD_NONE_LIMIT = 100;
191 * Tag value for words that appear to be numbers, lower limit.
193 * @provisional This is a draft API and might change in a future release of ICU.
195 public static final int WORD_NUMBER = 100;
198 * Tag value for words that appear to be numbers, upper limit.
200 * @provisional This is a draft API and might change in a future release of ICU.
202 public static final int WORD_NUMBER_LIMIT = 200;
205 * Tag value for words that contain letters, excluding
206 * hiragana, katakana or ideographic characters, lower limit.
208 * @provisional This is a draft API and might change in a future release of ICU.
210 public static final int WORD_LETTER = 200;
213 * Tag value for words containing letters, upper limit
215 * @provisional This is a draft API and might change in a future release of ICU.
217 public static final int WORD_LETTER_LIMIT = 300;
220 * Tag value for words containing kana characters, lower limit
222 * @provisional This is a draft API and might change in a future release of ICU.
224 public static final int WORD_KANA = 300;
227 * Tag value for words containing kana characters, upper limit
229 * @provisional This is a draft API and might change in a future release of ICU.
231 public static final int WORD_KANA_LIMIT = 400;
234 * Tag value for words containing ideographic characters, lower limit
236 * @provisional This is a draft API and might change in a future release of ICU.
238 public static final int WORD_IDEO = 400;
241 * Tag value for words containing ideographic characters, upper limit
243 * @provisional This is a draft API and might change in a future release of ICU.
245 public static final int WORD_IDEO_LIMIT = 500;
247 private static final int START_STATE = 1; // The state number of the starting state
248 private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
250 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
251 // of user text. A variable with this enum type keeps track of where we
252 // are. The state machine only fetches user text input while in RUN mode.
253 private static final int RBBI_START = 0;
254 private static final int RBBI_RUN = 1;
255 private static final int RBBI_END = 2;
258 * The character iterator through which this BreakIterator accesses the text.
260 private CharacterIterator fText = new java.text.StringCharacterIterator("");
263 * The rule data for this BreakIterator instance
265 * @deprecated This API is ICU internal only.
267 RBBIDataWrapper fRData;
270 * Index of the Rule {tag} values for the most recent match.
272 private int fLastRuleStatusIndex;
275 * Rule tag value valid flag.
276 * Some iterator operations don't intrinsically set the correct tag value.
277 * This flag lets us lazily compute the value if we are ever asked for it.
279 private boolean fLastStatusIndexValid;
282 * Counter for the number of characters encountered with the "dictionary"
283 * flag set. Normal RBBI iterators don't use it, although the code
284 * for updating it is live. Dictionary Based break iterators (a subclass
285 * of us) access this field directly.
288 private int fDictionaryCharCount;
291 * ICU debug argument name for RBBI
293 private static final String RBBI_DEBUG_ARG = "rbbi";
296 * Debugging flag. Trace operation of state machine when true.
298 * @deprecated This API is ICU internal only.
300 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
301 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
304 * What kind of break iterator this is. Set to KIND_LINE by default,
305 * since this produces sensible output.
307 private int fBreakType = KIND_LINE;
310 * The "default" break engine - just skips over ranges of dictionary words,
311 * producing no breaks. Should only be used if characters need to be handled
312 * by a dictionary but we have no dictionary implementation for them.
314 private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
317 * when a range of characters is divided up using the dictionary, the break
318 * positions that are discovered are stored here, preventing us from having
319 * to use either the dictionary or the state table again until the iterator
320 * leaves this range of text
322 private int[] fCachedBreakPositions;
325 * if fCachedBreakPositions is not null, this indicates which item in the
326 * cache the current iteration position refers to
328 private int fPositionInCache;
331 * Whether or not we should be using the dictionary. Set to true by
332 * default - only set to false if we get an empty string as input or
333 * if our "kind" is not KIND_WORD or KIND_LINE.
335 * If this is set to false, no dictionary handling is done.
337 private boolean fUseDictionary = true;
339 private final Set<LanguageBreakEngine> fBreakEngines = Collections.synchronizedSet(new HashSet<LanguageBreakEngine>());
342 * Dump the contents of the state table and character classes for this break iterator.
343 * For debugging only.
345 * @deprecated This API is ICU internal only.
352 * Compile a set of source break rules into the binary state tables used
353 * by the break iterator engine. Creating a break iterator from precompiled
354 * rules is much faster than creating one from source rules.
356 * Binary break rules are not guaranteed to be compatible between different
360 * @param rules The source form of the break rules
361 * @param ruleBinary An output stream to receive the compiled rules.
362 * @throws IOException If there is an error writing the output.
363 * @see #getInstanceFromCompiledRules(InputStream)
366 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
367 RBBIRuleBuilder.compileRules(rules, ruleBinary);
370 //=======================================================================
371 // BreakIterator overrides
372 //=======================================================================
375 * Sets the current iteration position to the beginning of the text.
376 * (i.e., the CharacterIterator's starting offset).
377 * @return The offset of the beginning of the text.
381 fCachedBreakPositions = null;
382 fDictionaryCharCount = 0;
383 fPositionInCache = 0;
384 fLastRuleStatusIndex = 0;
385 fLastStatusIndexValid = true;
387 return BreakIterator.DONE;
390 return fText.getIndex();
394 * Sets the current iteration position to the end of the text.
395 * (i.e., the CharacterIterator's ending offset).
396 * @return The text's past-the-end offset.
400 fCachedBreakPositions = null;
401 fDictionaryCharCount = 0;
402 fPositionInCache = 0;
405 fLastRuleStatusIndex = 0;
406 fLastStatusIndexValid = true;
407 return BreakIterator.DONE;
410 // t.last() returns the offset of the last character,
411 // rather than the past-the-end offset
412 // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
413 // will work correctly.
414 fLastStatusIndexValid = false;
415 int pos = fText.getEndIndex();
421 * Advances the iterator either forward or backward the specified number of steps.
422 * Negative values move backward, and positive values move forward. This is
423 * equivalent to repeatedly calling next() or previous().
424 * @param n The number of steps to move. The sign indicates the direction
425 * (negative is backwards, and positive is forwards).
426 * @return The character offset of the boundary position n boundaries away from
430 public int next(int n) {
431 int result = current();
433 result = handleNext();
444 * Advances the iterator to the next boundary position.
445 * @return The position of the first boundary after this one.
454 * Moves the iterator backwards, to the last boundary preceding this one.
455 * @return The position of the last boundary position preceding this one.
458 public int previous() {
459 CharacterIterator text = getText();
461 fLastStatusIndexValid = false;
463 // if we have cached break positions and we're still in the range
464 // covered by them, just move one step backward in the cache
465 if (fCachedBreakPositions != null && fPositionInCache > 0) {
467 text.setIndex(fCachedBreakPositions[fPositionInCache]);
468 return fCachedBreakPositions[fPositionInCache];
471 // otherwise, dump the cache and use the inherited previous() method to move
472 // backward. This may fill up the cache with new break positions, in which
473 // case we have to mark our position in the cache. If it doesn't, use next()
474 // to move forward until we hit or pass the current position. This *will* fill
477 // TODO: Try to reuse the array rather than reallocating it all the time
478 fCachedBreakPositions = null;
480 int offset = current();
481 int result = rulesPrevious();
482 if (result == BreakIterator.DONE) {
486 if (fDictionaryCharCount == 0) {
490 if (fCachedBreakPositions != null) {
491 fPositionInCache = fCachedBreakPositions.length - 2;
495 while (result < offset) {
496 int nextResult = handleNext();
497 if (nextResult >= offset) {
504 if (fCachedBreakPositions != null) {
505 for (fPositionInCache = 0; fPositionInCache < fCachedBreakPositions.length; fPositionInCache++) {
506 if (fCachedBreakPositions[fPositionInCache] >= offset) {
513 // prepare for the user asking for our status
514 // our status will have been marked as valid by the next()
515 // calls but isn't at the right place, so mark it as invalid
516 // and recompute it when the user asks
517 fLastStatusIndexValid = false;
518 text.setIndex(result);
524 private int rulesPrevious() {
525 // if we're already sitting at the beginning of the text, return DONE
526 if (fText == null || current() == fText.getBeginIndex()) {
527 fLastRuleStatusIndex = 0;
528 fLastStatusIndexValid = true;
529 return BreakIterator.DONE;
532 if (fRData.fSRTable != null || fRData.fSFTable != null) {
533 return handlePrevious(fRData.fRTable);
537 // set things up. handlePrevious() will back us up to some valid
538 // break position before the current position (we back our internal
539 // iterator up one step to prevent handlePrevious() from returning
540 // the current position), but not necessarily the last one before
543 int start = current();
546 int lastResult = handlePrevious(fRData.fRTable);
547 if (lastResult == BreakIterator.DONE) {
548 lastResult = fText.getBeginIndex();
549 fText.setIndex(lastResult);
551 int result = lastResult;
553 boolean breakTagValid = false;
555 // iterate forward from the known break position until we pass our
556 // starting point. The last break position before the starting
557 // point is our return value
560 result = handleNext();
561 if (result == BreakIterator.DONE || result >= start) {
565 lastTag = fLastRuleStatusIndex;
566 breakTagValid = true;
569 // fLastRuleStatusIndex wants to have the value for the section of text preceding
570 // the result position that we are to return (in lastResult.) If
571 // the backwards rules overshot and the above loop had to do two or more
572 // handleNext()s to move up to the desired return position, we will have a valid
573 // tag value. But, if handlePrevious() took us to exactly the correct result position,
574 // we won't have a tag value for that position, which is only set by handleNext().
576 // set the current iteration position to be the last break position
577 // before where we started, and then return that value
578 fText.setIndex(lastResult);
579 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
580 fLastStatusIndexValid = breakTagValid;
585 * Sets the iterator to refer to the first boundary position following
586 * the specified position.
587 * @param offset The position from which to begin searching for a break position.
588 * @return The position of the first break after the current position.
591 public int following(int offset) {
592 CharacterIterator text = getText();
594 // if we have no cached break positions, or if "offset" is outside the
595 // range covered by the cache, then dump the cache and call our
596 // inherited following() method. This will call other methods in this
597 // class that may refresh the cache.
598 if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] ||
599 offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
600 fCachedBreakPositions = null;
601 return rulesFollowing(offset);
604 // on the other hand, if "offset" is within the range covered by the
605 // cache, then just search the cache for the first break position
608 fPositionInCache = 0;
609 while (fPositionInCache < fCachedBreakPositions.length
610 && offset >= fCachedBreakPositions[fPositionInCache])
612 text.setIndex(fCachedBreakPositions[fPositionInCache]);
613 return text.getIndex();
617 private int rulesFollowing(int offset) {
618 // if the offset passed in is already past the end of the text,
619 // just return DONE; if it's before the beginning, return the
620 // text's starting offset
621 fLastRuleStatusIndex = 0;
622 fLastStatusIndexValid = true;
623 if (fText == null || offset >= fText.getEndIndex()) {
627 else if (offset < fText.getBeginIndex()) {
631 // otherwise, set our internal iteration position (temporarily)
632 // to the position passed in. If this is the _beginning_ position,
633 // then we can just use next() to get our return value
637 if (fRData.fSRTable != null) {
638 // Safe Point Reverse rules exist.
639 // This allows us to use the optimum algorithm.
640 fText.setIndex(offset);
641 // move forward one codepoint to prepare for moving back to a
643 // this handles offset being between a supplementary character
645 // handlePrevious will move most of the time to < 1 boundary away
646 handlePrevious(fRData.fSRTable);
648 while (result <= offset) {
653 if (fRData.fSFTable != null) {
654 // No Safe point reverse table, but there is a safe pt forward table.
656 fText.setIndex(offset);
658 // handle next will give result >= offset
659 handleNext(fRData.fSFTable);
660 // previous will give result 0 or 1 boundary away from offset,
663 int oldresult = previous();
664 while (oldresult > offset) {
666 if (result <= offset) {
672 if (result <= offset) {
677 // otherwise, we have to sync up first. Use handlePrevious() to back
678 // us up to a known break position before the specified position (if
679 // we can determine that the specified position is a break position,
680 // we don't back up at all). This may or may not be the last break
681 // position at or before our starting position. Advance forward
682 // from here until we've passed the starting position. The position
683 // we stop on will be the first break position after the specified one.
686 fText.setIndex(offset);
687 if (offset == fText.getBeginIndex()) {
692 while (result != BreakIterator.DONE && result <= offset) {
699 * Sets the iterator to refer to the last boundary position before the
700 * specified position.
701 * @param offset The position to begin searching for a break from.
702 * @return The position of the last boundary before the starting position.
705 public int preceding(int offset) {
706 CharacterIterator text = getText();
708 // if we have no cached break positions, or "offset" is outside the
709 // range covered by the cache, we can just call the inherited routine
710 // (which will eventually call other routines in this class that may
711 // refresh the cache)
712 if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] ||
713 offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
714 fCachedBreakPositions = null;
715 return rulesPreceding(offset);
718 // on the other hand, if "offset" is within the range covered by the cache,
719 // then all we have to do is search the cache for the last break position
722 fPositionInCache = 0;
723 while (fPositionInCache < fCachedBreakPositions.length
724 && offset > fCachedBreakPositions[fPositionInCache])
727 text.setIndex(fCachedBreakPositions[fPositionInCache]);
728 return text.getIndex();
732 private int rulesPreceding(int offset) {
733 // if the offset passed in is already past the end of the text,
734 // just return DONE; if it's before the beginning, return the
736 // text's starting offset
737 if (fText == null || offset > fText.getEndIndex()) {
738 // return BreakIterator::DONE;
741 else if (offset < fText.getBeginIndex()) {
745 // if we start by updating the current iteration position to the
746 // position specified by the caller, we can just use previous()
747 // to carry out this operation
750 if (fRData.fSFTable != null) {
753 fText.setIndex(offset);
754 // move backwards one codepoint to prepare for moving forwards to a
756 // this handles offset being between a supplementary character
758 handleNext(fRData.fSFTable);
760 while (result >= offset) {
765 if (fRData.fSRTable != null) {
766 // backup plan if forward safe table is not available
767 fText.setIndex(offset);
769 // handle previous will give result <= offset
770 handlePrevious(fRData.fSRTable);
772 // next will give result 0 or 1 boundary away from offset,
775 int oldresult = next();
776 while (oldresult < offset) {
778 if (result >= offset) {
784 if (result >= offset) {
791 fText.setIndex(offset);
796 * Throw IllegalArgumentException unless begin <= offset <= end.
799 protected static final void checkOffset(int offset, CharacterIterator text) {
800 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
801 throw new IllegalArgumentException("offset out of bounds");
807 * Returns true if the specified position is a boundary position. As a side
808 * effect, leaves the iterator pointing to the first boundary position at
810 * @param offset the offset to check.
811 * @return True if "offset" is a boundary position.
814 public boolean isBoundary(int offset) {
815 checkOffset(offset, fText);
817 // the beginning index of the iterator is always a boundary position by definition
818 if (offset == fText.getBeginIndex()) {
819 first(); // For side effects on current position, tag values.
823 if (offset == fText.getEndIndex()) {
824 last(); // For side effects on current position, tag values.
828 // otherwise, we can use following() on the position before the specified
829 // one and return true if the position we get back is the one the user
832 // return following(offset - 1) == offset;
833 // TODO: check whether it is safe to revert to the simpler offset-1 code
834 // The safe rules may take care of unpaired surrogates ok.
835 fText.setIndex(offset);
837 int pos = fText.getIndex();
838 boolean result = following(pos) == offset;
843 * Returns the current iteration position.
844 * @return The current iteration position.
847 public int current() {
848 return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
851 private void makeRuleStatusValid() {
852 if (fLastStatusIndexValid == false) {
853 // No cached status is available.
854 int curr = current();
855 if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
856 // At start of text, or there is no text. Status is always zero.
857 fLastRuleStatusIndex = 0;
858 fLastStatusIndexValid = true;
860 // Not at start of text. Find status the tedious way.
861 int pa = fText.getIndex();
864 while (fText.getIndex() < pa) {
867 Assert.assrt(pa == pb);
869 Assert.assrt(fLastStatusIndexValid == true);
870 Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
875 * Return the status tag from the break rule that determined the most recently
876 * returned break position. The values appear in the rule source
877 * within brackets, {123}, for example. For rules that do not specify a
878 * status, a default value of 0 is returned. If more than one rule applies,
879 * the numerically largest of the possible status values is returned.
881 * Of the standard types of ICU break iterators, only the word break
882 * iterator provides status values. The values are defined in
883 * class RuleBasedBreakIterator, and allow distinguishing between words
884 * that contain alphabetic letters, "words" that appear to be numbers,
885 * punctuation and spaces, words containing ideographic characters, and
886 * more. Call <code>getRuleStatus</code> after obtaining a boundary
887 * position from <code>next()</code>, <code>previous()</code>, or
888 * any other break iterator function that returns a boundary position.
890 * @return the status from the break rule that determined the most recently
891 * returned break position.
894 * @provisional This is a draft API and might change in a future release of ICU.
897 public int getRuleStatus() {
898 makeRuleStatusValid();
899 // Status records have this form:
900 // Count N <-- fLastRuleStatusIndex points here.
904 // Status val N-1 <-- the value we need to return
905 // The status values are sorted in ascending order.
906 // This function returns the last (largest) of the array of status values.
907 int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
908 int tagVal = fRData.fStatusTable[idx];
913 * Get the status (tag) values from the break rule(s) that determined the most
914 * recently returned break position. The values appear in the rule source
915 * within brackets, {123}, for example. The default status value for rules
916 * that do not explicitly provide one is zero.
918 * The status values used by the standard ICU break rules are defined
919 * as public constants in class RuleBasedBreakIterator.
921 * If the size of the output array is insufficient to hold the data,
922 * the output will be truncated to the available length. No exception
925 * @param fillInArray an array to be filled in with the status values.
926 * @return The number of rule status values from rules that determined
927 * the most recent boundary returned by the break iterator.
928 * In the event that the array is too small, the return value
929 * is the total number of status values that were available,
930 * not the reduced number that were actually returned.
932 * @provisional This is a draft API and might change in a future release of ICU.
934 public int getRuleStatusVec(int[] fillInArray) {
935 makeRuleStatusValid();
936 int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
937 if (fillInArray != null) {
938 int numToCopy = Math.min(numStatusVals, fillInArray.length);
939 for (int i=0; i<numToCopy; i++) {
940 fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
943 return numStatusVals;
947 * Return a CharacterIterator over the text being analyzed. This version
948 * of this method returns the actual CharacterIterator we're using internally.
949 * Changing the state of this iterator can have undefined consequences. If
950 * you need to change it, clone it first.
951 * @return An iterator over the text being analyzed.
954 public CharacterIterator getText() {
959 * Set the iterator to analyze a new piece of text. This function resets
960 * the current iteration position to the beginning of the text.
961 * @param newText An iterator over the text to analyze.
964 public void setText(CharacterIterator newText) {
966 // first() resets the caches
967 int firstIdx = this.first();
968 if (newText != null) {
969 fUseDictionary = ((fBreakType == KIND_WORD || fBreakType == KIND_LINE)
970 && newText.getEndIndex() != firstIdx);
976 * @deprecated This API is ICU internal only.
978 void setBreakType(int type) {
980 if (type != KIND_WORD && type != KIND_LINE) {
981 fUseDictionary = false;
987 * @deprecated This API is ICU internal only.
994 * Control debug, trace and dump options.
997 static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
998 ICUDebug.value(RBBI_DEBUG_ARG) : null;
1001 * Finds an appropriate LanguageBreakEngine for this character and
1004 * @deprecated This API is ICU internal only.
1006 private LanguageBreakEngine getEngineFor(int c) {
1007 if (c == DONE32 || !fUseDictionary) {
1011 for (LanguageBreakEngine candidate : fBreakEngines) {
1012 if (candidate.handles(c, fBreakType)) {
1017 // if we don't have an existing engine, build one.
1018 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
1019 LanguageBreakEngine eng = null;
1023 eng = new ThaiBreakEngine();
1026 eng = new LaoBreakEngine();
1028 case UScript.KATAKANA:
1029 case UScript.HIRAGANA:
1031 if (getBreakType() == KIND_WORD) {
1032 eng = new CjkBreakEngine(false);
1035 fUnhandledBreakEngine.handleChar(c, getBreakType());
1036 eng = fUnhandledBreakEngine;
1039 case UScript.HANGUL:
1040 if (getBreakType() == KIND_WORD) {
1041 eng = new CjkBreakEngine(true);
1043 fUnhandledBreakEngine.handleChar(c, getBreakType());
1044 eng = fUnhandledBreakEngine;
1048 fUnhandledBreakEngine.handleChar(c, getBreakType());
1049 eng = fUnhandledBreakEngine;
1052 } catch (IOException e) {
1057 fBreakEngines.add(eng);
1062 //-----------------------------------------------------------------------------------
1064 // handleNext(void) All forward iteration vectors through this function.
1066 //-----------------------------------------------------------------------------------
1067 private int handleNext() {
1068 // if there are no cached break positions, or if we've just moved
1069 // off the end of the range covered by the cache, we have to dump
1070 // and possibly regenerate the cache
1071 if (fCachedBreakPositions == null || fPositionInCache == fCachedBreakPositions.length - 1) {
1072 int startPos = fText.getIndex();
1074 // start by using the rules handleNext() to find a tentative return
1075 // value. dictionaryCharCount tells us how many dictionary characters
1076 // we passed over on our way to the tentative return value
1077 fDictionaryCharCount = 0;
1078 int result = handleNext(fRData.fFTable);
1080 // if we passed over more than one dictionary character, then we use
1081 // divideUpDictionaryRange() to regenerate the cached break positions
1082 // for the new range.
1083 if (fDictionaryCharCount > 1 && result - startPos > 1) {
1084 fText.setIndex(startPos);
1085 LanguageBreakEngine e = getEngineFor(current32(fText));
1087 // we have an engine! use it to produce breaks
1088 Stack<Integer> breaks = new Stack<Integer>();
1089 e.findBreaks(fText, startPos, result, false, getBreakType(), breaks);
1091 int breaksSize = breaks.size();
1092 fCachedBreakPositions = new int[breaksSize + 2];
1093 fCachedBreakPositions[0] = startPos;
1094 for (int i = 0; i < breaksSize; i++) {
1095 fCachedBreakPositions[i + 1] = breaks.elementAt(i).intValue();
1097 fCachedBreakPositions[breaksSize + 1] = result;
1099 fPositionInCache = 0;
1101 // we don't have an engine; just use the rules
1102 fText.setIndex(result);
1107 // otherwise, the value we got back from the inherited function
1108 // is our return value, and we can dump the cache
1109 fCachedBreakPositions = null;
1114 // if the cache of break positions has been regenerated (or existed all
1115 // along), then just advance to the next break position in the cache
1117 if (fCachedBreakPositions != null) {
1119 fText.setIndex(fCachedBreakPositions[fPositionInCache]);
1120 return fCachedBreakPositions[fPositionInCache];
1124 Assert.assrt(false);
1125 return BreakIterator.DONE; // WE SHOULD NEVER GET HERE!
1130 * The State Machine Engine for moving forward is here.
1131 * This function is the heart of the RBBI run time engine.
1134 * @return the new iterator position
1136 * A note on supplementary characters and the position of underlying
1137 * Java CharacterIterator: Normally, a character iterator is positioned at
1138 * the char most recently returned by next(). Within this function, when
1139 * a supplementary char is being processed, the char iterator is left
1140 * sitting on the trail surrogate, in the middle of the code point.
1141 * This is different from everywhere else, where an iterator always
1142 * points at the lead surrogate of a supplementary.
1144 private int handleNext(short stateTable[]) {
// Runs the forward state machine described by stateTable over fText,
// beginning at the iterator's current index. On the normal path fText is
// left positioned at the selected break (see the final text.setIndex(result)),
// and fLastRuleStatusIndex is refreshed from the table's TAGIDX column each
// time an unconditional accepting state (ACCEPTING == -1) is reached.
1146 System.out.println("Handle Next pos char state category");
1149 // No matter what, handleNext always correctly sets the break tag value.
1150 fLastStatusIndexValid = true;
1151 fLastRuleStatusIndex = 0;
1153 // Local copies of frequently used fields, for quicker access
1154 CharacterIterator text = fText;
1155 CharTrie trie = fRData.fTrie;
1157 // Set up the starting char
1158 int c = text.current();
// A value at or above the lead-surrogate range may be the first half of a
// surrogate pair; nextTrail32 presumably combines it with the following
// trail surrogate into one code point (see CharacterIteration.nextTrail32).
1159 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
1160 c = nextTrail32(text, c);
1162 return BreakIterator.DONE;
// result tracks the best break found so far; it starts at the initial
// position so that "result == initialPosition" later means "no progress".
1165 int initialPosition = text.getIndex();
1166 int result = initialPosition;
1168 // Set the initial state for the state machine
1169 int state = START_STATE;
// row is the offset of the current state's row within stateTable; the
// column constants (NEXTSTATES, ACCEPTING, LOOKAHEAD, TAGIDX) are added to it.
1170 int row = fRData.getRowIndex(state);
// Flags word of this state table; tested below for RBBI_BOF_REQUIRED and
// RBBI_LOOKAHEAD_HARD_BREAK.
1172 short flagsState = stateTable[RBBIDataWrapper.FLAGS+1];
1173 int mode = RBBI_RUN;
1174 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1178 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
1179 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1180 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
// Pending look-ahead bookkeeping:
//   lookaheadStatus - LOOKAHEAD column value of the state that opened the
//                     look-ahead; 0 means nothing is pending.
//   lookaheadTagIdx - TAGIDX value recorded at that same state.
//   lookaheadResult - text position recorded at that same state.
1183 int lookaheadStatus = 0;
1184 int lookaheadTagIdx = 0;
1185 int lookaheadResult = 0;
1187 // loop until we reach the end of the text or transition to state 0
1188 while (state != STOP_STATE) {
1190 // Reached end of input string.
1191 if (mode == RBBI_END) {
1192 // We have already run the loop one last time with the
1193 // character set to the pseudo {eof} value. Now it is time
1194 // to unconditionally bail out.
1196 if (lookaheadResult > result) {
1197 // We ran off the end of the string with a pending
1198 // look-ahead match.
1199 // Treat this as if the look-ahead condition had been
1201 // the match at the / position from the look-ahead rule.
1202 result = lookaheadResult;
1203 fLastRuleStatusIndex = lookaheadTagIdx;
1207 // Run the loop one last time with the fake end-of-input character category
1211 else if (mode == RBBI_RUN) {
1212 // Get the char category. An incoming category of 1 or 2 means that
1213 // we are preset for doing the beginning or end of input, and
1214 // that we shouldn't get a category from an actual text input character.
1217 // look up the current character's character category, which tells us
1218 // which column in the state table to look at.
1220 category = (short) trie.getCodePointValue(c);
1222 // Check the dictionary bit in the character's category.
1223 // Counter is only used by dictionary based iterators (subclasses).
1224 // Chars that need to be handled by a dictionary have a flag bit set
1225 // in their category values.
1227 if ((category & 0x4000) != 0) {
1228 fDictionaryCharCount++;
1229 // And off the dictionary flag bit.
1230 category &= ~0x4000;
1234 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
1235 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1236 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1239 // Advance to the next character.
1240 // If this is a beginning-of-input loop iteration, don't advance.
1241 // The next iteration will be processing the first real input character.
1242 c = (int)text.next();
1243 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
1244 c = nextTrail32(text, c);
1251 // look up a state transition in the state table
1252 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1253 row = fRData.getRowIndex(state);
// An ACCEPTING value of -1 marks an unconditional match at the current position.
1255 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1256 // Match found, common case
1257 result = text.getIndex();
1258 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
1259 // The iterator has been left in the middle of a surrogate pair.
1260 // We want the start of it.
1264 // Remember the break status (tag) values.
1265 fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
// A non-zero LOOKAHEAD entry means this state takes part in a look-ahead rule:
// either it completes a pending look-ahead (ACCEPTING equals the pending
// lookaheadStatus) or it records a new candidate position.
1268 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1269 if (lookaheadStatus != 0
1270 && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1271 // Lookahead match is completed. Set the result accordingly, but only
1272 // if no other rule has matched further in the mean time.
1273 result = lookaheadResult;
1274 fLastRuleStatusIndex = lookaheadTagIdx;
1275 lookaheadStatus = 0;
1276 // TODO: make a standalone hard break in a rule work.
1277 if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
1278 text.setIndex(result);
1281 // Look-ahead completed, but other rules may match further. Continue on.
1282 // TODO: junk this feature? I don't think it's used anywhere.
// Record the candidate look-ahead position together with its status and tag.
1286 lookaheadResult = text.getIndex();
1287 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
1288 // The iterator has been left in the middle of a surrogate pair.
1289 // We want the beginning of it.
1292 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1293 lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
1297 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1298 // Because this is an accepting state, any in-progress look-ahead match
1299 // is no longer relevant. Clear out the pending lookahead status.
1300 lookaheadStatus = 0;
1302 } // End of state machine main loop
1304 // The state machine is done. Check whether it found a match...
1306 // If the iterator failed to advance in the match engine force it ahead by one.
1307 // This indicates a defect in the break rules, which should always match
1308 // at least one character.
1310 if (result == initialPosition) {
1312 System.out.println("Iterator did not move. Advancing by 1.");
1314 text.setIndex(initialPosition);
1316 result = text.getIndex();
1319 // Leave the iterator at our result position.
1320 // (we may have advanced beyond the last accepting position chasing after
1321 // longer matches that never completed.)
1322 text.setIndex(result);
1325 System.out.println("result = " + result);
1330 private int handlePrevious(short stateTable[]) {
1331 if (fText == null || stateTable == null) {
1340 int lookaheadStatus = 0;
1342 int initialPosition = 0;
1343 int lookaheadResult = 0;
1344 boolean lookAheadHardBreak =
1345 (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1347 // handlePrevious() never gets the rule status.
1348 // Flag the status as invalid; if the user ever asks for status, we will need
1349 // to back up, then re-find the break position using handleNext(), which does
1350 // get the status value.
1351 fLastStatusIndexValid = false;
1352 fLastRuleStatusIndex = 0;
1354 // set up the starting char
1355 initialPosition = fText.getIndex();
1356 result = initialPosition;
1357 c = previous32(fText);
1359 // Set up the initial state for the state machine
1360 state = START_STATE;
1361 row = fRData.getRowIndex(state);
1362 category = 3; // TODO: obsolete? from the old start/run mode scheme?
1364 if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
1370 System.out.println("Handle Prev pos char state category ");
1373 // loop until we reach the beginning of the text or transition to state 0
1375 mainLoop: for (;;) {
1378 // Reached end of input string.
1379 if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
1380 // Either this is the old (ICU 3.2 and earlier) format data which
1381 // does not support explicit support for matching {eof}, or
1382 // we have already done the {eof} iteration. Now is the time
1383 // to unconditionally bail out.
1384 if (lookaheadResult < result) {
1385 // We ran off the end of the string with a pending look-ahead match.
1386 // Treat this as if the look-ahead condition had been met, and return
1387 // the match at the / position from the look-ahead rule.
1388 result = lookaheadResult;
1389 lookaheadStatus = 0;
1390 } else if (result == initialPosition) {
1391 // Ran off start, no match found.
1392 // Move one position (towards the start, since we are doing previous.)
1393 fText.setIndex(initialPosition);
1402 if (mode == RBBI_RUN) {
1403 // look up the current character's category, which tells us
1404 // which column in the state table to look at.
1406 category = (short) fRData.fTrie.getCodePointValue(c);
1408 // Check the dictionary bit in the character's category.
1409 // Counter is only used by dictionary based iterators (subclasses).
1410 // Chars that need to be handled by a dictionary have a flag bit set
1411 // in their category values.
1413 if ((category & 0x4000) != 0) {
1414 fDictionaryCharCount++;
1415 // And off the dictionary flag bit.
1416 category &= ~0x4000;
1422 System.out.print(" " + fText.getIndex() + " ");
1423 if (0x20 <= c && c < 0x7f) {
1424 System.out.print(" " + c + " ");
1426 System.out.print(" " + Integer.toHexString(c) + " ");
1428 System.out.println(" " + state + " " + category + " ");
1431 // State Transition - move machine to its next state
1433 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1434 row = fRData.getRowIndex(state);
1436 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
1437 // Match found, common case, could have lookahead so we move
1439 result = fText.getIndex();
1442 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
1443 if (lookaheadStatus != 0
1444 && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
1445 // Lookahead match is completed. Set the result
1446 // accordingly, but only
1447 // if no other rule has matched further in the mean
1449 result = lookaheadResult;
1450 lookaheadStatus = 0;
1451 // TODO: make a stand-alone hard break in a rule work.
1453 if (lookAheadHardBreak) {
1456 // Look-ahead completed, but other rules may match further.
1458 // TODO: junk this feature? I don't think that it's used anywhere.
1461 // Hit a possible look-ahead match. We are at the
1462 // position of the '/'. Remember this position.
1463 lookaheadResult = fText.getIndex();
1464 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
1469 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
1470 // This is a plain (non-look-ahead) accepting state.
1471 if (!lookAheadHardBreak) {
1472 // Clear out any pending look-ahead matches,
1473 // but only if not doing the lookAheadHardBreak option
1474 // which needs to force a break no matter what is going
1475 // on with the rest of the match, i.e. we can't abandon
1476 // a partially completed look-ahead match because
1477 // some other rule matched further than the '/' position
1478 // in the look-ahead match.
1479 lookaheadStatus = 0;
1483 } // end of innerBlock. "break innerBlock" in above code comes out here.
1486 if (state == STOP_STATE) {
1487 // Normal loop exit is here
1491 // then move iterator position backwards one character
1493 if (mode == RBBI_RUN) {
1494 c = previous32(fText);
1496 if (mode == RBBI_START) {
1502 } // End of the main loop.
1504 // The state machine is done. Check whether it found a match...
1506 // If the iterator failed to advance in the match engine, force it ahead by one.
1507 // (This really indicates a defect in the break rules. They should always match
1508 // at least one character.)
1509 if (result == initialPosition) {
1510 result = fText.setIndex(initialPosition);
1512 result = fText.getIndex();
1515 fText.setIndex(result);
1517 System.out.println("Result = " + result);