2 *******************************************************************************
\r
3 * Copyright (C) 2005-2008 International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.text.CharacterIterator;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.io.OutputStream;
\r
13 import java.io.ByteArrayInputStream;
\r
14 import java.io.ByteArrayOutputStream;
\r
16 import com.ibm.icu.impl.Assert;
\r
17 import com.ibm.icu.impl.ICUDebug;
\r
21 * Rule Based Break Iterator
\r
22 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
\r
26 public class RuleBasedBreakIterator extends BreakIterator {
\r
29 //=======================================================================
\r
30 // Constructors & Factories
\r
31 //=======================================================================
\r
35 * @deprecated This API is ICU internal only.
\r
37 public RuleBasedBreakIterator() {
\r
41 * Create a break iterator from a precompiled set of rules.
\r
43 * @deprecated This API is ICU internal only.
\r
45 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
\r
46 RuleBasedBreakIterator This = new RuleBasedBreakIterator();
\r
47 This.fRData = RBBIDataWrapper.get(is);
\r
51 /*private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
\r
52 // TODO: check types.
\r
53 fRData = other.fRData;
\r
54 if (fText != null) {
\r
55 fText = (CharacterIterator)(other.fText.clone());
\r
60 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
\r
61 * @param rules The break rules to be used.
\r
64 public RuleBasedBreakIterator(String rules) {
\r
67 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
\r
68 compileRules(rules, ruleOS);
\r
69 byte [] ruleBA = ruleOS.toByteArray();
\r
70 InputStream ruleIS = new ByteArrayInputStream(ruleBA);
\r
71 fRData = RBBIDataWrapper.get(ruleIS);
\r
72 } catch (IOException e) {
\r
73 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
\r
74 // causing bogus compiled rules to be produced, but with no compile error raised.
\r
75 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
\r
82 //=======================================================================
\r
84 //=======================================================================
\r
87 * Clones this iterator.
\r
88 * @return A newly-constructed RuleBasedBreakIterator with the same
\r
89 * behavior as this one.
\r
92 public Object clone()
\r
94 RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
\r
95 if (fText != null) {
\r
96 result.fText = (CharacterIterator)(fText.clone());
\r
102 * Returns true if both BreakIterators are of the same class, have the same
\r
103 * rules, and iterate over the same text.
\r
106 public boolean equals(Object that) {
\r
108 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
\r
109 if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
\r
112 if (fRData != null && other.fRData != null &&
\r
113 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
\r
116 if (fText == null && other.fText == null) {
\r
119 if (fText == null || other.fText == null) {
\r
122 return fText.equals(other.fText);
\r
124 catch(ClassCastException e) {
\r
130 * Returns the description (rules) used to create this iterator.
\r
131 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
\r
134 public String toString() {
\r
135 String retStr = null;
\r
136 if (fRData != null) {
\r
137 retStr = fRData.fRuleSource;
\r
143 * Compute a hashcode for this BreakIterator
\r
144 * @return A hash code
\r
147 public int hashCode()
\r
149 return fRData.fRuleSource.hashCode();
\r
154 * Tag value for "words" that do not fit into any of other categories.
\r
155 * Includes spaces and most punctuation.
\r
157 * @provisional This is a draft API and might change in a future release of ICU.
\r
159 public static final int WORD_NONE = 0;
\r
162 * Upper bound for tags for uncategorized words.
\r
164 * @provisional This is a draft API and might change in a future release of ICU.
\r
166 public static final int WORD_NONE_LIMIT = 100;
\r
169 * Tag value for words that appear to be numbers, lower limit.
\r
171 * @provisional This is a draft API and might change in a future release of ICU.
\r
173 public static final int WORD_NUMBER = 100;
\r
176 * Tag value for words that appear to be numbers, upper limit.
\r
178 * @provisional This is a draft API and might change in a future release of ICU.
\r
180 public static final int WORD_NUMBER_LIMIT = 200;
\r
183 * Tag value for words that contain letters, excluding
\r
184 * hiragana, katakana or ideographic characters, lower limit.
\r
186 * @provisional This is a draft API and might change in a future release of ICU.
\r
188 public static final int WORD_LETTER = 200;
\r
191 * Tag value for words containing letters, upper limit
\r
193 * @provisional This is a draft API and might change in a future release of ICU.
\r
195 public static final int WORD_LETTER_LIMIT = 300;
\r
198 * Tag value for words containing kana characters, lower limit
\r
200 * @provisional This is a draft API and might change in a future release of ICU.
\r
202 public static final int WORD_KANA = 300;
\r
205 * Tag value for words containing kana characters, upper limit
\r
207 * @provisional This is a draft API and might change in a future release of ICU.
\r
209 public static final int WORD_KANA_LIMIT = 400;
\r
212 * Tag value for words containing ideographic characters, lower limit
\r
214 * @provisional This is a draft API and might change in a future release of ICU.
\r
216 public static final int WORD_IDEO = 400;
\r
219 * Tag value for words containing ideographic characters, upper limit
\r
221 * @provisional This is a draft API and might change in a future release of ICU.
\r
223 public static final int WORD_IDEO_LIMIT = 500;
\r
228 private static final int START_STATE = 1; // The state number of the starting state
\r
229 private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
\r
231 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
\r
232 // of user text. A variable with this enum type keeps track of where we
\r
233 // are. The state machine only fetches user text input while in RUN mode.
\r
234 private static final int RBBI_START = 0;
\r
235 private static final int RBBI_RUN = 1;
\r
236 private static final int RBBI_END = 2;
\r
239 * The character iterator through which this BreakIterator accesses the text.
\r
241 private CharacterIterator fText = new java.text.StringCharacterIterator("");
\r
244 * The rule data for this BreakIterator instance
\r
246 * @deprecated This API is ICU internal only.
\r
248 protected RBBIDataWrapper fRData;
\r
251 * Index of the Rule {tag} values for the most recent match.
\r
253 private int fLastRuleStatusIndex;
\r
256 * Rule tag value valid flag.
\r
257 * Some iterator operations don't intrinsically set the correct tag value.
\r
258 * This flag lets us lazily compute the value if we are ever asked for it.
\r
260 private boolean fLastStatusIndexValid;
\r
263 * Counter for the number of characters encountered with the "dictionary"
\r
264 * flag set. Normal RBBI iterators don't use it, although the code
\r
265 * for updating it is live. Dictionary Based break iterators (a subclass
\r
266 * of us) access this field directly.
\r
268 * @deprecated This API is ICU internal only.
\r
270 protected int fDictionaryCharCount;
\r
273 * Debugging flag. Trace operation of state machine when true.
\r
275 * @deprecated This API is ICU internal only.
\r
277 public static boolean fTrace;
\r
280 * ICU debug argument name for RBBI
\r
282 private static final String RBBI_DEBUG_ARG = "rbbi";
\r
285 * Dump the contents of the state table and character classes for this break iterator.
\r
286 * For debugging only.
\r
288 * @deprecated This API is ICU internal only.
\r
290 public void dump() {
\r
291 this.fRData.dump();
\r
294 private static boolean debugInitDone = false;
\r
296 private void init() {
\r
297 fLastStatusIndexValid = true;
\r
298 fDictionaryCharCount = 0;
\r
301 if (debugInitDone == false) {
\r
302 fTrace = ICUDebug.enabled(RBBI_DEBUG_ARG)
\r
303 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
\r
304 debugInitDone = true;
\r
308 private static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
\r
309 RBBIRuleBuilder.compileRules(rules, ruleBinary);
\r
312 //=======================================================================
\r
313 // BreakIterator overrides
\r
314 //=======================================================================
\r
317 * Sets the current iteration position to the beginning of the text.
\r
318 * (i.e., the CharacterIterator's starting offset).
\r
319 * @return The offset of the beginning of the text.
\r
322 public int first() {
\r
323 fLastRuleStatusIndex = 0;
\r
324 fLastStatusIndexValid = true;
\r
325 if (fText == null) {
\r
326 return BreakIterator.DONE;
\r
329 return fText.getIndex();
\r
334 * Sets the current iteration position to the end of the text.
\r
335 * (i.e., the CharacterIterator's ending offset).
\r
336 * @return The text's past-the-end offset.
\r
339 public int last() {
\r
340 if (fText == null) {
\r
341 fLastRuleStatusIndex = 0;
\r
342 fLastStatusIndexValid = true;
\r
343 return BreakIterator.DONE;
\r
346 // I'm not sure why, but t.last() returns the offset of the last character,
\r
347 // rather than the past-the-end offset
\r
349 // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
\r
350 // will work correctly.)
\r
353 fLastStatusIndexValid = false;
\r
354 int pos = fText.getEndIndex();
\r
355 fText.setIndex(pos);
\r
361 * Advances the iterator either forward or backward the specified number of steps.
\r
362 * Negative values move backward, and positive values move forward. This is
\r
363 * equivalent to repeatedly calling next() or previous().
\r
364 * @param n The number of steps to move. The sign indicates the direction
\r
365 * (negative is backwards, and positive is forwards).
\r
366 * @return The character offset of the boundary position n boundaries away from
\r
370 public int next(int n) {
\r
371 int result = current();
\r
373 result = handleNext();
\r
377 result = previous();
\r
385 * Advances the iterator to the next boundary position.
\r
386 * @return The position of the first boundary after this one.
\r
389 public int next() {
\r
390 return handleNext();
\r
395 * Moves the iterator backwards, to the last boundary preceding this one.
\r
396 * @return The position of the last boundary position preceding this one.
\r
399 public int previous() {
\r
400 // if we're already sitting at the beginning of the text, return DONE
\r
401 if (fText == null || current() == fText.getBeginIndex()) {
\r
402 fLastRuleStatusIndex = 0;
\r
403 fLastStatusIndexValid = true;
\r
404 return BreakIterator.DONE;
\r
407 if (fRData.fSRTable != null || fRData.fSFTable != null) {
\r
408 return handlePrevious(fRData.fRTable);
\r
412 // set things up. handlePrevious() will back us up to some valid
\r
413 // break position before the current position (we back our internal
\r
414 // iterator up one step to prevent handlePrevious() from returning
\r
415 // the current position), but not necessarily the last one before
\r
416 // where we started
\r
418 int start = current();
\r
420 CIPrevious32(fText);
\r
421 int lastResult = handlePrevious(fRData.fRTable);
\r
422 if (lastResult == BreakIterator.DONE) {
\r
423 lastResult = fText.getBeginIndex();
\r
424 fText.setIndex(lastResult);
\r
426 int result = lastResult;
\r
428 boolean breakTagValid = false;
\r
430 // iterate forward from the known break position until we pass our
\r
431 // starting point. The last break position before the starting
\r
432 // point is our return value
\r
435 result = handleNext();
\r
436 if (result == BreakIterator.DONE || result >= start) {
\r
439 lastResult = result;
\r
440 lastTag = fLastRuleStatusIndex;
\r
441 breakTagValid = true;
\r
444 // fLastRuleStatusIndex wants to have the value for the section of text preceding
\r
445 // the result position that we are to return (in lastResult.) If
\r
446 // the backwards rules overshot and the above loop had to do two or more
\r
447 // handleNext()s to move up to the desired return position, we will have a valid
\r
448 // tag value. But, if handlePrevious() took us to exactly the correct result position,
\r
449 // we won't have a tag value for that position, which is only set by handleNext().
\r
451 // set the current iteration position to be the last break position
\r
452 // before where we started, and then return that value
\r
453 fText.setIndex(lastResult);
\r
454 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
\r
455 fLastStatusIndexValid = breakTagValid;
\r
459 * Sets the iterator to refer to the first boundary position following
\r
460 * the specified position.
\r
461 * @param offset The position from which to begin searching for a break position.
\r
462 * @return The position of the first break after the current position.
\r
465 public int following(int offset) {
\r
466 // if the offset passed in is already past the end of the text,
\r
467 // just return DONE; if it's before the beginning, return the
\r
468 // text's starting offset
\r
469 fLastRuleStatusIndex = 0;
\r
470 fLastStatusIndexValid = true;
\r
471 if (fText == null || offset >= fText.getEndIndex()) {
\r
475 else if (offset < fText.getBeginIndex()) {
\r
479 // otherwise, set our internal iteration position (temporarily)
\r
480 // to the position passed in. If this is the _beginning_ position,
\r
481 // then we can just use next() to get our return value
\r
485 if (fRData.fSRTable != null) {
\r
486 // Safe Point Reverse rules exist.
\r
487 // This allows us to use the optimum algorithm.
\r
488 fText.setIndex(offset);
\r
489 // move forward one codepoint to prepare for moving back to a
\r
491 // this handles offset being between a supplementary character
\r
493 // handlePrevious will move most of the time to < 1 boundary away
\r
494 handlePrevious(fRData.fSRTable);
\r
496 while (result <= offset) {
\r
501 if (fRData.fSFTable != null) {
\r
502 // No Safe point reverse table, but there is a safe pt forward table.
\r
504 fText.setIndex(offset);
\r
505 CIPrevious32(fText);
\r
506 // handle next will give result >= offset
\r
507 handleNext(fRData.fSFTable);
\r
508 // previous will give result 0 or 1 boundary away from offset,
\r
509 // most of the time
\r
511 int oldresult = previous();
\r
512 while (oldresult > offset) {
\r
513 result = previous();
\r
514 if (result <= offset) {
\r
517 oldresult = result;
\r
520 if (result <= offset) {
\r
525 // otherwise, we have to sync up first. Use handlePrevious() to back
\r
526 // us up to a known break position before the specified position (if
\r
527 // we can determine that the specified position is a break position,
\r
528 // we don't back up at all). This may or may not be the last break
\r
529 // position at or before our starting position. Advance forward
\r
530 // from here until we've passed the starting position. The position
\r
531 // we stop on will be the first break position after the specified one.
\r
534 fText.setIndex(offset);
\r
535 if (offset == fText.getBeginIndex()) {
\r
536 return handleNext();
\r
538 result = previous();
\r
540 while (result != BreakIterator.DONE && result <= offset) {
\r
547 * Sets the iterator to refer to the last boundary position before the
\r
548 * specified position.
\r
549 * @param offset The position to begin searching for a break from.
\r
550 * @return The position of the last boundary before the starting position.
\r
553 public int preceding(int offset) {
\r
554 // if the offset passed in is already past the end of the text,
\r
555 // just return DONE; if it's before the beginning, return the
\r
557 // text's starting offset
\r
558 if (fText == null || offset > fText.getEndIndex()) {
\r
559 // return BreakIterator::DONE;
\r
562 else if (offset < fText.getBeginIndex()) {
\r
566 // if we start by updating the current iteration position to the
\r
567 // position specified by the caller, we can just use previous()
\r
568 // to carry out this operation
\r
571 if (fRData.fSFTable != null) {
\r
574 fText.setIndex(offset);
\r
575 // move backwards one codepoint to prepare for moving forwards to a
\r
577 // this handles offset being between a supplementary character
\r
578 CIPrevious32(fText);
\r
579 handleNext(fRData.fSFTable);
\r
580 result = previous();
\r
581 while (result >= offset) {
\r
582 result = previous();
\r
586 if (fRData.fSRTable != null) {
\r
587 // backup plan if forward safe table is not available
\r
588 fText.setIndex(offset);
\r
590 // handle previous will give result <= offset
\r
591 handlePrevious(fRData.fSRTable);
\r
593 // next will give result 0 or 1 boundary away from offset,
\r
594 // most of the time
\r
596 int oldresult = next();
\r
597 while (oldresult < offset) {
\r
599 if (result >= offset) {
\r
602 oldresult = result;
\r
604 result = previous();
\r
605 if (result >= offset) {
\r
612 fText.setIndex(offset);
\r
617 * Throw IllegalArgumentException unless begin <= offset <= end.
\r
620 protected static final void checkOffset(int offset, CharacterIterator text) {
\r
621 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
\r
622 throw new IllegalArgumentException("offset out of bounds");
\r
628 * Returns true if the specified position is a boundary position. As a side
\r
629 * effect, leaves the iterator pointing to the first boundary position at
\r
630 * or after "offset".
\r
631 * @param offset the offset to check.
\r
632 * @return True if "offset" is a boundary position.
\r
635 public boolean isBoundary(int offset) {
\r
636 checkOffset(offset, fText);
\r
638 // the beginning index of the iterator is always a boundary position by definition
\r
639 if (offset == fText.getBeginIndex()) {
\r
640 first(); // For side effects on current position, tag values.
\r
644 if (offset == fText.getEndIndex()) {
\r
645 last(); // For side effects on current position, tag values.
\r
649 // otherwise, we can use following() on the position before the specified
\r
650 // one and return true if the position we get back is the one the user
\r
653 // return following(offset - 1) == offset;
\r
654 // TODO: check whether it is safe to revert to the simpler offset-1 code
\r
655 // The safe rules may take care of unpaired surrogates ok.
\r
656 fText.setIndex(offset);
\r
657 CIPrevious32(fText);
\r
658 int pos = fText.getIndex();
\r
659 boolean result = following(pos) == offset;
\r
664 * Returns the current iteration position.
\r
665 * @return The current iteration position.
\r
668 public int current() {
\r
669 return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
\r
674 private void makeRuleStatusValid() {
\r
675 if (fLastStatusIndexValid == false) {
\r
676 // No cached status is available.
\r
677 if (fText == null || current() == fText.getBeginIndex()) {
\r
678 // At start of text, or there is no text. Status is always zero.
\r
679 fLastRuleStatusIndex = 0;
\r
680 fLastStatusIndexValid = true;
\r
682 // Not at start of text. Find status the tedious way.
\r
683 int pa = current();
\r
686 Assert.assrt (pa == pb);
\r
688 Assert.assrt(fLastStatusIndexValid == true);
\r
689 Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
\r
695 * Return the status tag from the break rule that determined the most recently
\r
696 * returned break position. The values appear in the rule source
\r
697 * within brackets, {123}, for example. For rules that do not specify a
\r
698 * status, a default value of 0 is returned. If more than one rule applies,
\r
699 * the numerically largest of the possible status values is returned.
\r
701 * Of the standard types of ICU break iterators, only the word break
\r
702 * iterator provides status values. The values are defined in
\r
703 * class RuleBasedBreakIterator, and allow distinguishing between words
\r
704 * that contain alphabetic letters, "words" that appear to be numbers,
\r
705 * punctuation and spaces, words containing ideographic characters, and
\r
706 * more. Call <code>getRuleStatus</code> after obtaining a boundary
\r
707 * position from <code>next()</code>, <code>previous()</code>, or
\r
708 * any other break iterator function that returns a boundary position.
\r
710 * @return the status from the break rule that determined the most recently
\r
711 * returned break position.
\r
714 * @provisional This is a draft API and might change in a future release of ICU.
\r
717 public int getRuleStatus() {
\r
718 makeRuleStatusValid();
\r
719 // Status records have this form:
\r
720 // Count N <-- fLastRuleStatusIndex points here.
\r
724 // Status val N-1 <-- the value we need to return
\r
725 // The status values are sorted in ascending order.
\r
726 // This function returns the last (largest) of the array of status values.
\r
727 int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
\r
728 int tagVal = fRData.fStatusTable[idx];
\r
736 * Get the status (tag) values from the break rule(s) that determined the most
\r
737 * recently returned break position. The values appear in the rule source
\r
738 * within brackets, {123}, for example. The default status value for rules
\r
739 * that do not explicitly provide one is zero.
\r
741 * The status values used by the standard ICU break rules are defined
\r
742 * as public constants in class RuleBasedBreakIterator.
\r
744 * If the size of the output array is insufficient to hold the data,
\r
745 * the output will be truncated to the available length. No exception
\r
748 * @param fillInArray an array to be filled in with the status values.
\r
749 * @return The number of rule status values from rules that determined
\r
750 * the most recent boundary returned by the break iterator.
\r
751 * In the event that the array is too small, the return value
\r
752 * is the total number of status values that were available,
\r
753 * not the reduced number that were actually returned.
\r
755 * @provisional This is a draft API and might change in a future release of ICU.
\r
757 public int getRuleStatusVec(int[] fillInArray) {
\r
758 makeRuleStatusValid();
\r
759 int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
\r
760 if (fillInArray != null) {
\r
761 int numToCopy = Math.min(numStatusVals, fillInArray.length);
\r
762 for (int i=0; i<numToCopy; i++) {
\r
763 fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
\r
766 return numStatusVals;
\r
771 * Return a CharacterIterator over the text being analyzed. This version
\r
772 * of this method returns the actual CharacterIterator we're using internally.
\r
773 * Changing the state of this iterator can have undefined consequences. If
\r
774 * you need to change it, clone it first.
\r
775 * @return An iterator over the text being analyzed.
\r
778 public CharacterIterator getText() {
\r
784 * Set the iterator to analyze a new piece of text. This function resets
\r
785 * the current iteration position to the beginning of the text.
\r
786 * @param newText An iterator over the text to analyze.
\r
789 public void setText(CharacterIterator newText) {
\r
795 * Control debug, trace and dump options.
\r
797 * @deprecated This API is ICU internal only.
\r
799 protected static String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
\r
800 ICUDebug.value(RBBI_DEBUG_ARG) : null;
\r
803 // 32 bit Char value returned from when an iterator has run out of range.
\r
804 // Positive value so fast case (not end, not surrogate) can be checked
\r
805 // with a single test.
\r
806 private static int CI_DONE32 = 0x7fffffff;
\r
809 * Move the iterator forward to the next code point, and return that code point,
\r
810 * leaving the iterator positioned at char returned.
\r
811 * For Supplementary chars, the iterator is left positioned at the lead surrogate.
\r
812 * @param ci The character iterator
\r
813 * @return The next code point.
\r
815 static int CINext32(CharacterIterator ci) {
\r
816 // If the current position is at a surrogate pair, move to the trail surrogate
\r
817 // which leaves it in position for underlying iterator's next() to work.
\r
818 int c= ci.current();
\r
819 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
821 if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
822 c = ci.previous();
\r
826 // For BMP chars, this next() is the real deal.
\r
829 // If we might have a lead surrogate, we need to peek ahead to get the trail
\r
830 // even though we don't want to really be positioned there.
\r
831 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
832 c = CINextTrail32(ci, c);
\r
835 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
\r
836 // We got a supplementary char. Back the iterator up to the position
\r
837 // of the lead surrogate.
\r
844 // Out-of-line portion of the in-line Next32 code.
\r
845 // The call site does an initial ci.next() and calls this function
\r
846 // if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
\r
847 // NOTE: we leave the underlying char iterator positioned in the
\r
848 // middle of a surrogate pair. ci.next() will work correctly
\r
849 // from there, but the ci.getIndex() will be wrong, and needs
\r
851 private static int CINextTrail32(CharacterIterator ci, int lead) {
\r
853 if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
854 char cTrail = ci.next();
\r
855 if (UTF16.isTrailSurrogate(cTrail)) {
\r
856 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
\r
857 (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
\r
858 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
863 if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
\r
864 retVal = CI_DONE32;
\r
870 private static int CIPrevious32(CharacterIterator ci) {
\r
871 if (ci.getIndex() <= ci.getBeginIndex()) {
\r
874 char trail = ci.previous();
\r
875 int retVal = trail;
\r
876 if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
\r
877 char lead = ci.previous();
\r
878 if (UTF16.isLeadSurrogate(lead)) {
\r
879 retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
\r
880 ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
\r
881 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
889 static int CICurrent32(CharacterIterator ci) {
\r
890 char lead = ci.current();
\r
892 if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
895 if (UTF16.isLeadSurrogate(lead)) {
\r
896 int trail = (int)ci.next();
\r
898 if (UTF16.isTrailSurrogate((char)trail)) {
\r
899 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
\r
900 (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
\r
901 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
904 if (lead == CharacterIterator.DONE) {
\r
905 if (ci.getIndex() >= ci.getEndIndex()) {
\r
906 retVal = CI_DONE32;
\r
914 //-----------------------------------------------------------------------------------
\r
916 // handleNext(void) All forward iteration vectors through this function.
\r
917 // NOTE: This function is overridden by the dictionary based break iterator.
\r
918 // User level API functions go to the dbbi implementation
\r
919 // when the break iterator type is dbbi.
\r
920 // The DBBI implementation sometimes explicitly calls back to here,
\r
921 // its inherited handleNext().
\r
923 //-----------------------------------------------------------------------------------
\r
925 return handleNext(fRData.fFTable);
\r
929 * The State Machine Engine for moving forward is here.
\r
930 * This function is the heart of the RBBI run time engine.
\r
932 * @param stateTable
\r
933 * @return the new iterator position
\r
935 * A note on supplementary characters and the position of underlying
\r
936 * Java CharacterIterator: Normally, a character iterator is positioned at
\r
937 * the char most recently returned by next(). Within this function, when
\r
938 * a supplementary char is being processed, the char iterator is left
\r
939 * sitting on the trail surrogate, in the middle of the code point.
\r
940 * This is different from everywhere else, where an iterator always
\r
941 * points at the lead surrogate of a supplementary.
\r
943 private int handleNext(short stateTable[]) {
\r
945 short category = 0;
\r
949 int lookaheadStatus = 0;
\r
950 int lookaheadTagIdx = 0;
\r
952 int initialPosition = 0;
\r
953 int lookaheadResult = 0;
\r
954 boolean lookAheadHardBreak =
\r
955 (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
\r
958 System.out.println("Handle Next pos char state category");
\r
961 // No matter what, handleNext always correctly sets the break tag value.
\r
962 fLastStatusIndexValid = true;
\r
963 fLastRuleStatusIndex = 0;
\r
965 // if we're already at the end of the text, return DONE.
\r
966 if (fText == null) {
\r
967 fLastRuleStatusIndex = 0;
\r
968 return BreakIterator.DONE;
\r
971 // Set up the starting char
\r
972 initialPosition = fText.getIndex();
\r
973 result = initialPosition;
\r
974 c = fText.current();
\r
975 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
976 c = CINextTrail32(fText, c);
\r
977 if (c == CI_DONE32) {
\r
978 fLastRuleStatusIndex = 0;
\r
979 return BreakIterator.DONE;
\r
983 // Set the initial state for the state machine
\r
984 state = START_STATE;
\r
985 row = fRData.getRowIndex(state);
\r
988 if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
\r
994 // loop until we reach the end of the text or transition to state 0
\r
995 while (state != STOP_STATE) {
\r
996 if (c == CI_DONE32) {
\r
997 // Reached end of input string.
\r
998 if (mode == RBBI_END) {
\r
999 // We have already run the loop one last time with the
\r
1000 // character set to the pseudo {eof} value. Now it is time
\r
1001 // to unconditionally bail out.
\r
1003 if (lookaheadResult > result) {
\r
1004 // We ran off the end of the string with a pending
\r
1005 // look-ahead match.
\r
1006 // Treat this as if the look-ahead condition had been
\r
1007 // met, and return
\r
1008 // the match at the / position from the look-ahead rule.
\r
1009 result = lookaheadResult;
\r
1010 fLastRuleStatusIndex = lookaheadTagIdx;
\r
1011 lookaheadStatus = 0;
\r
1012 } else if (result == initialPosition) {
\r
1013 // Ran off end, no match found.
\r
1014 // move forward one
\r
1015 fText.setIndex(initialPosition);
\r
1020 // Run the loop one last time with the fake end-of-input character category
\r
1025 // Get the char category. An incoming category of 1 or 2 means that
\r
1026 // we are preset for doing the beginning or end of input, and
\r
1027 // that we shouldn't get a category from an actual text input character.
\r
1029 if (mode == RBBI_RUN) {
\r
1030 // look up the current character's character category, which tells us
\r
1031 // which column in the state table to look at.
\r
1033 category = (short) fRData.fTrie.getCodePointValue(c);
\r
1035 // Check the dictionary bit in the character's category.
\r
1036 // Counter is only used by dictionary based iterators (subclasses).
\r
1037 // Chars that need to be handled by a dictionary have a flag bit set
\r
1038 // in their category values.
\r
1040 if ((category & 0x4000) != 0) {
\r
1041 fDictionaryCharCount++;
\r
1042 // And off the dictionary flag bit.
\r
1043 category &= ~0x4000;
\r
1048 System.out.print(" " + RBBIDataWrapper.intToString(fText.getIndex(), 5));
\r
1049 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
\r
1050 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
\r
1053 // look up a state transition in the state table
\r
1054 // state = row->fNextState[category];
\r
1055 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
\r
1056 row = fRData.getRowIndex(state);
\r
1058 // Advance to the next character.
\r
1059 // If this is a beginning-of-input loop iteration, don't advance.
\r
1060 // The next iteration will be processing the first real input character.
\r
1061 if (mode == RBBI_RUN) {
\r
1062 c = (int)fText.next();
\r
1063 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
1064 c = CINextTrail32(fText, c);
\r
1067 if (mode == RBBI_START) {
\r
1072 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
\r
1073 // Match found, common case
\r
1074 result = fText.getIndex();
\r
1075 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
\r
1076 // The iterator has been left in the middle of a surrogate pair.
\r
1077 // We want the start of it.
\r
1081 // Remember the break status (tag) values.
\r
1082 fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
\r
1085 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
\r
1086 if (lookaheadStatus != 0
\r
1087 && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
\r
1088 // Lookahead match is completed. Set the result accordingly, but only
\r
1089 // if no other rule has matched further in the mean time.
\r
1090 result = lookaheadResult;
\r
1091 fLastRuleStatusIndex = lookaheadTagIdx;
\r
1092 lookaheadStatus = 0;
\r
1093 // TODO: make a standalone hard break in a rule work.
\r
1094 if (lookAheadHardBreak) {
\r
1097 // Look-ahead completed, but other rules may match further. Continue on.
\r
1098 // TODO: junk this feature? I don't think it's used anywhere.
\r
1102 lookaheadResult = fText.getIndex();
\r
1103 if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) {
\r
1104 // The iterator has been left in the middle of a surrogate pair.
\r
1105 // We want the beginning of it.
\r
1106 lookaheadResult--;
\r
1108 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
\r
1109 lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
\r
1114 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
\r
1115 // Because this is an accepting state, any in-progress look-ahead match
\r
1116 // is no longer relevant. Clear out the pending lookahead status.
\r
1117 lookaheadStatus = 0;
\r
1120 } // End of state machine main loop
\r
1122 // The state machine is done. Check whether it found a match...
\r
1124 // If the iterator failed to advance in the match engine, force it ahead by one.
\r
1125 // (This really indicates a defect in the break rules. They should always match
\r
1126 // at least one character.)
\r
1127 if (result == initialPosition) {
\r
1128 result = fText.setIndex(initialPosition);
\r
1130 result = fText.getIndex();
\r
1133 // Leave the iterator at our result position.
\r
1134 // (we may have advanced beyond the last accepting position chasing after
\r
1135 // longer matches that never completed.)
\r
1136 fText.setIndex(result);
\r
1138 System.out.println("result = " + result);
\r
/**
 * Runs the break state machine backwards over fText, starting at the
 * iterator's current index and stepping with CIPrevious32().
 *
 * Side effects: marks the cached rule status as invalid
 * (fLastStatusIndexValid = false, fLastRuleStatusIndex = 0), bumps
 * fDictionaryCharCount for every character whose category carries the
 * 0x4000 dictionary flag, and leaves fText positioned at the position found.
 *
 * @param stateTable the state table to run; its FLAGS+1 entry is tested for
 *                   RBBI_LOOKAHEAD_HARD_BREAK and RBBI_BOF_REQUIRED.
 * @return NOTE(review): presumably the position held in the local "result"
 *         (the index fText is left at) -- confirm against the return statement.
 */
1145 private int handlePrevious(short stateTable[]) {
\r
1151 int lookaheadStatus = 0;
\r
1153 int initialPosition = 0;
\r
1154 int lookaheadResult = 0;
\r
// Table-wide option: when set, a completed look-ahead match is handled
// specially below and plain accepting states do not cancel a pending look-ahead.
1155 boolean lookAheadHardBreak =
\r
1156 (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
\r
// Guard against a missing text or state table.
1159 if (fText == null || stateTable == null) {
\r
1162 // handlePrevious() never gets the rule status.
\r
1163 // Flag the status as invalid; if the user ever asks for status, we will need
\r
1164 // to back up, then re-find the break position using handleNext(), which does
\r
1165 // get the status value.
\r
1166 fLastStatusIndexValid = false;
\r
1167 fLastRuleStatusIndex = 0;
\r
1169 // set up the starting char
\r
1170 initialPosition = fText.getIndex();
\r
1171 result = initialPosition;
\r
1172 c = CIPrevious32(fText);
\r
1174 // Set up the initial state for the state machine
\r
1175 state = START_STATE;
\r
1176 row = fRData.getRowIndex(state);
\r
1177 category = 3; // TODO: obsolete? from the old start/run mode scheme?
\r
// Tables flagged RBBI_BOF_REQUIRED begin in RBBI_START mode instead of
// going straight to the first real character.
1179 if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
\r
1181 mode = RBBI_START;
\r
1185 System.out.println("Handle Prev pos char state category ");
\r
1188 // loop until we reach the beginning of the text or transition to state 0
\r
1190 mainLoop: for (;;) {
\r
1192 if (c == CI_DONE32) {
\r
1193 // CIPrevious32() ran out of input: we have reached the start of the text.
\r
1194 if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
\r
1195 // Either this is the old (ICU 3.2 and earlier) format data which
\r
1196 // does not support explicit support for matching {eof}, or
\r
1197 // we have already done the {eof} iteration. Now is the time
\r
1198 // to unconditionally bail out.
\r
1199 if (lookaheadResult < result) {
\r
1200 // We ran off the end of the string with a pending look-ahead match.
\r
1201 // Treat this as if the look-ahead condition had been met, and return
\r
1202 // the match at the / position from the look-ahead rule.
\r
1203 result = lookaheadResult;
\r
1204 lookaheadStatus = 0;
\r
1205 } else if (result == initialPosition) {
\r
1206 // Ran off start, no match found.
\r
1207 // Move one position (towards the start, since we are doing previous.)
\r
1208 fText.setIndex(initialPosition);
\r
1209 CIPrevious32(fText);
\r
// Only in RBBI_RUN mode does the category come from the current character;
// in the other modes the preset category value is used as-is.
1217 if (mode == RBBI_RUN) {
\r
1218 // look up the current character's category, which tells us
\r
1219 // which column in the state table to look at.
\r
1221 category = (short) fRData.fTrie.getCodePointValue(c);
\r
1223 // Check the dictionary bit in the character's category.
\r
1224 // Counter is only used by dictionary based iterators (subclasses).
\r
1225 // Chars that need to be handled by a dictionary have a flag bit set
\r
1226 // in their category values.
\r
1228 if ((category & 0x4000) != 0) {
\r
1229 fDictionaryCharCount++;
\r
1230 // And off the dictionary flag bit.
\r
1231 category &= ~0x4000;
\r
// Trace line: text index, the character (printed in hex when it is outside
// 0x20..0x7e), then the current state and category.
1237 System.out.print(" " + fText.getIndex() + " ");
\r
1238 if (0x20 <= c && c < 0x7f) {
\r
1239 System.out.print(" " + c + " ");
\r
1241 System.out.print(" " + Integer.toHexString(c) + " ");
\r
1243 System.out.println(" " + state + " " + category + " ");
\r
1246 // State Transition - move machine to its next state
\r
1248 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
\r
1249 row = fRData.getRowIndex(state);
\r
1251 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
\r
1252 // Match found, common case, could have lookahead so we move
\r
1254 result = fText.getIndex();
\r
// A non-zero LOOKAHEAD entry marks a state that takes part in a look-ahead rule.
1257 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
\r
1258 if (lookaheadStatus != 0
\r
1259 && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
\r
1260 // Lookahead match is completed. Set the result
\r
1261 // accordingly, but only
\r
1262 // if no other rule has matched further in the mean
\r
1264 result = lookaheadResult;
\r
1265 lookaheadStatus = 0;
\r
1266 // TODO: make a standalone hard break in a rule work.
\r
1268 if (lookAheadHardBreak) {
\r
1271 // Look-ahead completed, but other rules may match further.
\r
1273 // TODO: junk this feature? I don't think that it's used anywhere.
\r
1276 // Hit a possible look-ahead match. We are at the
\r
1277 // position of the '/'. Remember this position.
\r
1278 lookaheadResult = fText.getIndex();
\r
1279 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
\r
1283 // not lookahead...
\r
1284 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
\r
1285 // This is a plain (non-look-ahead) accepting state.
\r
1286 if (!lookAheadHardBreak) {
\r
1287 // Clear out any pending look-ahead matches,
\r
1288 // but only if not doing the lookAheadHardBreak option
\r
1289 // which needs to force a break no matter what is going
\r
1290 // on with the rest of the match, i.e. we can't abandon
\r
1291 // a partially completed look-ahead match because
\r
1292 // some other rule matched further than the '/' position
\r
1293 // in the look-ahead match.
\r
1294 lookaheadStatus = 0;
\r
1298 } // end of innerBlock. "break innerBlock" in above code comes out here.
\r
1301 if (state == STOP_STATE) {
\r
1302 // Normal loop exit is here
\r
1306 // then move iterator position backwards one character
\r
1308 if (mode == RBBI_RUN) {
\r
1309 c = CIPrevious32(fText);
\r
// NOTE(review): presumably the start-of-input pass hands over to RBBI_RUN
// here without consuming a character -- confirm.
1311 if (mode == RBBI_START) {
\r
1317 } // End of the main loop.
\r
1319 // The state machine is done. Check whether it found a match...
\r
1321 // If the iterator failed to move in the match engine, force it back by one
\r
1322 // (This really indicates a defect in the break rules. They should always match
\r
1323 // at least one character.)
\r
1324 if (result == initialPosition) {
\r
1325 result = fText.setIndex(initialPosition);
\r
1326 CIPrevious32(fText);
\r
1327 result = fText.getIndex();
\r
// Leave the iterator at the result position.
1330 fText.setIndex(result);
\r
1332 System.out.println("Result = " + result);
\r
1342 //-------------------------------------------------------------------------------
\r
1346 // isDictionaryChar Return true if the category lookup for this char
\r
1348 // indicates that it is in the set of dictionary lookup chars,
// i.e. the 0x4000 flag bit is set in the value the trie returns for c.
\r
1354 // This function is intended for use by dictionary based
\r
1356 // break iterators.
\r
// @param c the code point to look up in fRData.fTrie
// @return true when the looked-up category has bit 0x4000 set
1360 //-------------------------------------------------------------------------------
\r
1362 boolean isDictionaryChar(int c) {
\r
// Raw category value for c, still carrying any flag bits.
1364 short category = (short) fRData.fTrie.getCodePointValue(c);
\r
// 0x4000 is the same dictionary flag that the state-machine loops test and mask off.
1366 return (category & 0x4000) != 0;
\r