2 *******************************************************************************
\r
3 * Copyright (C) 2005-2010 International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.ByteArrayInputStream;
\r
10 import java.io.ByteArrayOutputStream;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.io.OutputStream;
\r
14 import java.text.CharacterIterator;
\r
16 import com.ibm.icu.impl.Assert;
\r
17 import com.ibm.icu.impl.ICUDebug;
\r
21 * Rule Based Break Iterator
\r
22 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
\r
26 public class RuleBasedBreakIterator extends BreakIterator {
\r
29 //=======================================================================
\r
30 // Constructors & Factories
\r
31 //=======================================================================
\r
35 * @deprecated This API is ICU internal only.
\r
37 public RuleBasedBreakIterator() {
\r
41 * Create a break iterator from a precompiled set of rules.
\r
43 * @deprecated This API is ICU internal only.
\r
45 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
\r
46 RuleBasedBreakIterator This = new RuleBasedBreakIterator();
\r
47 This.fRData = RBBIDataWrapper.get(is);
\r
51 /*private RuleBasedBreakIterator(RuleBasedBreakIterator other) {
\r
52 // TODO: check types.
\r
53 fRData = other.fRData;
\r
54 if (fText != null) {
\r
55 fText = (CharacterIterator)(other.fText.clone());
\r
60 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
\r
61 * @param rules The break rules to be used.
\r
64 public RuleBasedBreakIterator(String rules) {
\r
67 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
\r
68 compileRules(rules, ruleOS);
\r
69 byte [] ruleBA = ruleOS.toByteArray();
\r
70 InputStream ruleIS = new ByteArrayInputStream(ruleBA);
\r
71 fRData = RBBIDataWrapper.get(ruleIS);
\r
72 } catch (IOException e) {
\r
74 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
\r
75 // causing bogus compiled rules to be produced, but with no compile error raised.
\r
76 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
\r
84 //=======================================================================
\r
86 //=======================================================================
\r
89 * Clones this iterator.
\r
90 * @return A newly-constructed RuleBasedBreakIterator with the same
\r
91 * behavior as this one.
\r
94 public Object clone()
\r
96 RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
\r
97 if (fText != null) {
\r
98 result.fText = (CharacterIterator)(fText.clone());
\r
104 * Returns true if both BreakIterators are of the same class, have the same
\r
105 * rules, and iterate over the same text.
\r
108 public boolean equals(Object that) {
\r
110 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
\r
111 if (fRData != other.fRData && (fRData == null || other.fRData == null)) {System.out.println("GOT HERE");
\r
114 if (fRData != null && other.fRData != null &&
\r
115 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
\r
118 if (fText == null && other.fText == null) {
\r
121 if (fText == null || other.fText == null) {
\r
124 return fText.equals(other.fText);
\r
126 catch(ClassCastException e) {
\r
132 * Returns the description (rules) used to create this iterator.
\r
133 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
\r
136 public String toString() {
\r
137 String retStr = null;
\r
138 if (fRData != null) {
\r
139 retStr = fRData.fRuleSource;
\r
145 * Compute a hashcode for this BreakIterator
\r
146 * @return A hash code
\r
149 public int hashCode()
\r
151 return fRData.fRuleSource.hashCode();
\r
156 * Tag value for "words" that do not fit into any of other categories.
\r
157 * Includes spaces and most punctuation.
\r
159 * @provisional This is a draft API and might change in a future release of ICU.
\r
161 public static final int WORD_NONE = 0;
\r
164 * Upper bound for tags for uncategorized words.
\r
166 * @provisional This is a draft API and might change in a future release of ICU.
\r
168 public static final int WORD_NONE_LIMIT = 100;
\r
171 * Tag value for words that appear to be numbers, lower limit.
\r
173 * @provisional This is a draft API and might change in a future release of ICU.
\r
175 public static final int WORD_NUMBER = 100;
\r
178 * Tag value for words that appear to be numbers, upper limit.
\r
180 * @provisional This is a draft API and might change in a future release of ICU.
\r
182 public static final int WORD_NUMBER_LIMIT = 200;
\r
185 * Tag value for words that contain letters, excluding
\r
186 * hiragana, katakana or ideographic characters, lower limit.
\r
188 * @provisional This is a draft API and might change in a future release of ICU.
\r
190 public static final int WORD_LETTER = 200;
\r
193 * Tag value for words containing letters, upper limit
\r
195 * @provisional This is a draft API and might change in a future release of ICU.
\r
197 public static final int WORD_LETTER_LIMIT = 300;
\r
200 * Tag value for words containing kana characters, lower limit
\r
202 * @provisional This is a draft API and might change in a future release of ICU.
\r
204 public static final int WORD_KANA = 300;
\r
207 * Tag value for words containing kana characters, upper limit
\r
209 * @provisional This is a draft API and might change in a future release of ICU.
\r
211 public static final int WORD_KANA_LIMIT = 400;
\r
214 * Tag value for words containing ideographic characters, lower limit
\r
216 * @provisional This is a draft API and might change in a future release of ICU.
\r
218 public static final int WORD_IDEO = 400;
\r
221 * Tag value for words containing ideographic characters, upper limit
\r
223 * @provisional This is a draft API and might change in a future release of ICU.
\r
225 public static final int WORD_IDEO_LIMIT = 500;
\r
230 private static final int START_STATE = 1; // The state number of the starting state
\r
231 private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
\r
233 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
\r
234 // of user text. A variable with this enum type keeps track of where we
\r
235 // are. The state machine only fetches user text input while in RUN mode.
\r
236 private static final int RBBI_START = 0;
\r
237 private static final int RBBI_RUN = 1;
\r
238 private static final int RBBI_END = 2;
\r
241 * The character iterator through which this BreakIterator accesses the text.
\r
243 private CharacterIterator fText = new java.text.StringCharacterIterator("");
\r
246 * The rule data for this BreakIterator instance
\r
248 * @deprecated This API is ICU internal only.
\r
250 protected RBBIDataWrapper fRData;
\r
253 * Index of the Rule {tag} values for the most recent match.
\r
255 private int fLastRuleStatusIndex;
\r
258 * Rule tag value valid flag.
\r
259 * Some iterator operations don't intrinsically set the correct tag value.
\r
260 * This flag lets us lazily compute the value if we are ever asked for it.
\r
262 private boolean fLastStatusIndexValid;
\r
265 * Counter for the number of characters encountered with the "dictionary"
\r
266 * flag set. Normal RBBI iterators don't use it, although the code
\r
267 * for updating it is live. Dictionary Based break iterators (a subclass
\r
268 * of us) access this field directly.
\r
270 * @deprecated This API is ICU internal only.
\r
272 protected int fDictionaryCharCount;
\r
275 * Debugging flag. Trace operation of state machine when true.
\r
277 * @deprecated This API is ICU internal only.
\r
279 public static boolean fTrace;
\r
282 * ICU debug argument name for RBBI
\r
284 private static final String RBBI_DEBUG_ARG = "rbbi";
\r
287 * Dump the contents of the state table and character classes for this break iterator.
\r
288 * For debugging only.
\r
290 * @deprecated This API is ICU internal only.
\r
292 public void dump() {
\r
293 this.fRData.dump();
\r
296 private static boolean debugInitDone = false;
\r
298 private void init() {
\r
299 fLastStatusIndexValid = true;
\r
300 fDictionaryCharCount = 0;
\r
303 if (debugInitDone == false) {
\r
304 fTrace = ICUDebug.enabled(RBBI_DEBUG_ARG)
\r
305 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
\r
306 debugInitDone = true;
\r
310 private static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
\r
311 RBBIRuleBuilder.compileRules(rules, ruleBinary);
\r
314 //=======================================================================
\r
315 // BreakIterator overrides
\r
316 //=======================================================================
\r
319 * Sets the current iteration position to the beginning of the text.
\r
320 * (i.e., the CharacterIterator's starting offset).
\r
321 * @return The offset of the beginning of the text.
\r
324 public int first() {
\r
325 fLastRuleStatusIndex = 0;
\r
326 fLastStatusIndexValid = true;
\r
327 if (fText == null) {
\r
328 return BreakIterator.DONE;
\r
331 return fText.getIndex();
\r
336 * Sets the current iteration position to the end of the text.
\r
337 * (i.e., the CharacterIterator's ending offset).
\r
338 * @return The text's past-the-end offset.
\r
341 public int last() {
\r
342 if (fText == null) {
\r
343 fLastRuleStatusIndex = 0;
\r
344 fLastStatusIndexValid = true;
\r
345 return BreakIterator.DONE;
\r
348 // I'm not sure why, but t.last() returns the offset of the last character,
\r
349 // rather than the past-the-end offset
\r
351 // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
\r
352 // will work correctly.)
\r
355 fLastStatusIndexValid = false;
\r
356 int pos = fText.getEndIndex();
\r
357 fText.setIndex(pos);
\r
363 * Advances the iterator either forward or backward the specified number of steps.
\r
364 * Negative values move backward, and positive values move forward. This is
\r
365 * equivalent to repeatedly calling next() or previous().
\r
366 * @param n The number of steps to move. The sign indicates the direction
\r
367 * (negative is backwards, and positive is forwards).
\r
368 * @return The character offset of the boundary position n boundaries away from
\r
372 public int next(int n) {
\r
373 int result = current();
\r
375 result = handleNext();
\r
379 result = previous();
\r
387 * Advances the iterator to the next boundary position.
\r
388 * @return The position of the first boundary after this one.
\r
391 public int next() {
\r
392 return handleNext();
\r
397 * Moves the iterator backwards, to the last boundary preceding this one.
\r
398 * @return The position of the last boundary position preceding this one.
\r
401 public int previous() {
\r
402 // if we're already sitting at the beginning of the text, return DONE
\r
403 if (fText == null || current() == fText.getBeginIndex()) {
\r
404 fLastRuleStatusIndex = 0;
\r
405 fLastStatusIndexValid = true;
\r
406 return BreakIterator.DONE;
\r
409 if (fRData.fSRTable != null || fRData.fSFTable != null) {
\r
410 return handlePrevious(fRData.fRTable);
\r
414 // set things up. handlePrevious() will back us up to some valid
\r
415 // break position before the current position (we back our internal
\r
416 // iterator up one step to prevent handlePrevious() from returning
\r
417 // the current position), but not necessarily the last one before
\r
418 // where we started
\r
420 int start = current();
\r
422 CIPrevious32(fText);
\r
423 int lastResult = handlePrevious(fRData.fRTable);
\r
424 if (lastResult == BreakIterator.DONE) {
\r
425 lastResult = fText.getBeginIndex();
\r
426 fText.setIndex(lastResult);
\r
428 int result = lastResult;
\r
430 boolean breakTagValid = false;
\r
432 // iterate forward from the known break position until we pass our
\r
433 // starting point. The last break position before the starting
\r
434 // point is our return value
\r
437 result = handleNext();
\r
438 if (result == BreakIterator.DONE || result >= start) {
\r
441 lastResult = result;
\r
442 lastTag = fLastRuleStatusIndex;
\r
443 breakTagValid = true;
\r
446 // fLastBreakTag wants to have the value for section of text preceding
\r
447 // the result position that we are to return (in lastResult.) If
\r
448 // the backwards rules overshot and the above loop had to do two or more
\r
449 // handleNext()s to move up to the desired return position, we will have a valid
\r
450 // tag value. But, if handlePrevious() took us to exactly the correct result position,
\r
451 // we won't have a tag value for that position, which is only set by handleNext().
\r
453 // set the current iteration position to be the last break position
\r
454 // before where we started, and then return that value
\r
455 fText.setIndex(lastResult);
\r
456 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
\r
457 fLastStatusIndexValid = breakTagValid;
\r
461 * Sets the iterator to refer to the first boundary position following
\r
462 * the specified position.
\r
463 * @param offset The position from which to begin searching for a break position.
\r
464 * @return The position of the first break after the current position.
\r
467 public int following(int offset) {
\r
468 // if the offset passed in is already past the end of the text,
\r
469 // just return DONE; if it's before the beginning, return the
\r
470 // text's starting offset
\r
471 fLastRuleStatusIndex = 0;
\r
472 fLastStatusIndexValid = true;
\r
473 if (fText == null || offset >= fText.getEndIndex()) {
\r
477 else if (offset < fText.getBeginIndex()) {
\r
481 // otherwise, set our internal iteration position (temporarily)
\r
482 // to the position passed in. If this is the _beginning_ position,
\r
483 // then we can just use next() to get our return value
\r
487 if (fRData.fSRTable != null) {
\r
488 // Safe Point Reverse rules exist.
\r
489 // This allows us to use the optimum algorithm.
\r
490 fText.setIndex(offset);
\r
491 // move forward one codepoint to prepare for moving back to a
\r
493 // this handles offset being between a supplementary character
\r
495 // handlePrevious will move most of the time to < 1 boundary away
\r
496 handlePrevious(fRData.fSRTable);
\r
498 while (result <= offset) {
\r
503 if (fRData.fSFTable != null) {
\r
504 // No Safe point reverse table, but there is a safe pt forward table.
\r
506 fText.setIndex(offset);
\r
507 CIPrevious32(fText);
\r
508 // handle next will give result >= offset
\r
509 handleNext(fRData.fSFTable);
\r
510 // previous will give result 0 or 1 boundary away from offset,
\r
511 // most of the time
\r
513 int oldresult = previous();
\r
514 while (oldresult > offset) {
\r
515 result = previous();
\r
516 if (result <= offset) {
\r
519 oldresult = result;
\r
522 if (result <= offset) {
\r
527 // otherwise, we have to sync up first. Use handlePrevious() to back
\r
528 // us up to a known break position before the specified position (if
\r
529 // we can determine that the specified position is a break position,
\r
530 // we don't back up at all). This may or may not be the last break
\r
531 // position at or before our starting position. Advance forward
\r
532 // from here until we've passed the starting position. The position
\r
533 // we stop on will be the first break position after the specified one.
\r
536 fText.setIndex(offset);
\r
537 if (offset == fText.getBeginIndex()) {
\r
538 return handleNext();
\r
540 result = previous();
\r
542 while (result != BreakIterator.DONE && result <= offset) {
\r
549 * Sets the iterator to refer to the last boundary position before the
\r
550 * specified position.
\r
551 * @param offset The position to begin searching for a break from.
\r
552 * @return The position of the last boundary before the starting position.
\r
555 public int preceding(int offset) {
\r
556 // if the offset passed in is already past the end of the text,
\r
557 // just return DONE; if it's before the beginning, return the
\r
559 // text's starting offset
\r
560 if (fText == null || offset > fText.getEndIndex()) {
\r
561 // return BreakIterator::DONE;
\r
564 else if (offset < fText.getBeginIndex()) {
\r
568 // if we start by updating the current iteration position to the
\r
569 // position specified by the caller, we can just use previous()
\r
570 // to carry out this operation
\r
573 if (fRData.fSFTable != null) {
\r
576 fText.setIndex(offset);
\r
577 // move backwards one codepoint to prepare for moving forwards to a
\r
579 // this handles offset being between a supplementary character
\r
580 CIPrevious32(fText);
\r
581 handleNext(fRData.fSFTable);
\r
582 result = previous();
\r
583 while (result >= offset) {
\r
584 result = previous();
\r
588 if (fRData.fSRTable != null) {
\r
589 // backup plan if forward safe table is not available
\r
590 fText.setIndex(offset);
\r
592 // handle previous will give result <= offset
\r
593 handlePrevious(fRData.fSRTable);
\r
595 // next will give result 0 or 1 boundary away from offset,
\r
596 // most of the time
\r
598 int oldresult = next();
\r
599 while (oldresult < offset) {
\r
601 if (result >= offset) {
\r
604 oldresult = result;
\r
606 result = previous();
\r
607 if (result >= offset) {
\r
614 fText.setIndex(offset);
\r
619 * Throw IllegalArgumentException unless begin <= offset <= end.
\r
622 protected static final void checkOffset(int offset, CharacterIterator text) {
\r
623 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
\r
624 throw new IllegalArgumentException("offset out of bounds");
\r
630 * Returns true if the specified position is a boundary position. As a side
\r
631 * effect, leaves the iterator pointing to the first boundary position at
\r
632 * or after "offset".
\r
633 * @param offset the offset to check.
\r
634 * @return True if "offset" is a boundary position.
\r
637 public boolean isBoundary(int offset) {
\r
638 checkOffset(offset, fText);
\r
640 // the beginning index of the iterator is always a boundary position by definition
\r
641 if (offset == fText.getBeginIndex()) {
\r
642 first(); // For side effects on current position, tag values.
\r
646 if (offset == fText.getEndIndex()) {
\r
647 last(); // For side effects on current position, tag values.
\r
651 // otherwise, we can use following() on the position before the specified
\r
652 // one and return true if the position we get back is the one the user
\r
655 // return following(offset - 1) == offset;
\r
656 // TODO: check whether it is safe to revert to the simpler offset-1 code
\r
657 // The safe rules may take care of unpaired surrogates ok.
\r
658 fText.setIndex(offset);
\r
659 CIPrevious32(fText);
\r
660 int pos = fText.getIndex();
\r
661 boolean result = following(pos) == offset;
\r
666 * Returns the current iteration position.
\r
667 * @return The current iteration position.
\r
670 public int current() {
\r
671 return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
\r
676 private void makeRuleStatusValid() {
\r
677 if (fLastStatusIndexValid == false) {
\r
678 // No cached status is available.
\r
679 if (fText == null || current() == fText.getBeginIndex()) {
\r
680 // At start of text, or there is no text. Status is always zero.
\r
681 fLastRuleStatusIndex = 0;
\r
682 fLastStatusIndexValid = true;
\r
684 // Not at start of text. Find status the tedious way.
\r
685 int pa = current();
\r
688 Assert.assrt (pa == pb);
\r
690 Assert.assrt(fLastStatusIndexValid == true);
\r
691 Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
\r
697 * Return the status tag from the break rule that determined the most recently
\r
698 * returned break position. The values appear in the rule source
\r
699 * within brackets, {123}, for example. For rules that do not specify a
\r
700 * status, a default value of 0 is returned. If more than one rule applies,
\r
701 * the numerically largest of the possible status values is returned.
\r
703 * Of the standard types of ICU break iterators, only the word break
\r
704 * iterator provides status values. The values are defined in
\r
705 * class RuleBasedBreakIterator, and allow distinguishing between words
\r
706 * that contain alphabetic letters, "words" that appear to be numbers,
\r
707 * punctuation and spaces, words containing ideographic characters, and
\r
708 * more. Call <code>getRuleStatus</code> after obtaining a boundary
\r
709 * position from <code>next()</code>, <code>previous()</code>, or
\r
710 * any other break iterator functions that returns a boundary position.
\r
712 * @return the status from the break rule that determined the most recently
\r
713 * returned break position.
\r
716 * @provisional This is a draft API and might change in a future release of ICU.
\r
719 public int getRuleStatus() {
\r
720 makeRuleStatusValid();
\r
721 // Status records have this form:
\r
722 // Count N <-- fLastRuleStatusIndex points here.
\r
726 // Status val N-1 <-- the value we need to return
\r
727 // The status values are sorted in ascending order.
\r
728 // This function returns the last (largest) of the array of status values.
\r
729 int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
\r
730 int tagVal = fRData.fStatusTable[idx];
\r
738 * Get the status (tag) values from the break rule(s) that determined the most
\r
739 * recently returned break position. The values appear in the rule source
\r
740 * within brackets, {123}, for example. The default status value for rules
\r
741 * that do not explicitly provide one is zero.
\r
743 * The status values used by the standard ICU break rules are defined
\r
744 * as public constants in class RuleBasedBreakIterator.
\r
746 * If the size of the output array is insufficient to hold the data,
\r
747 * the output will be truncated to the available length. No exception
\r
750 * @param fillInArray an array to be filled in with the status values.
\r
751 * @return The number of rule status values from rules that determined
\r
752 * the most recent boundary returned by the break iterator.
\r
753 * In the event that the array is too small, the return value
\r
754 * is the total number of status values that were available,
\r
755 * not the reduced number that were actually returned.
\r
757 * @provisional This is a draft API and might change in a future release of ICU.
\r
759 public int getRuleStatusVec(int[] fillInArray) {
\r
760 makeRuleStatusValid();
\r
761 int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
\r
762 if (fillInArray != null) {
\r
763 int numToCopy = Math.min(numStatusVals, fillInArray.length);
\r
764 for (int i=0; i<numToCopy; i++) {
\r
765 fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
\r
768 return numStatusVals;
\r
773 * Return a CharacterIterator over the text being analyzed. This version
\r
774 * of this method returns the actual CharacterIterator we're using internally.
\r
775 * Changing the state of this iterator can have undefined consequences. If
\r
776 * you need to change it, clone it first.
\r
777 * @return An iterator over the text being analyzed.
\r
780 public CharacterIterator getText() {
\r
786 * Set the iterator to analyze a new piece of text. This function resets
\r
787 * the current iteration position to the beginning of the text.
\r
788 * @param newText An iterator over the text to analyze.
\r
791 public void setText(CharacterIterator newText) {
\r
797 * Control debug, trace and dump options.
\r
799 * @deprecated This API is ICU internal only.
\r
801 protected static String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
\r
802 ICUDebug.value(RBBI_DEBUG_ARG) : null;
\r
805 // 32 bit Char value returned from when an iterator has run out of range.
\r
806 // Positive value so fast case (not end, not surrogate) can be checked
\r
807 // with a single test.
\r
808 private static int CI_DONE32 = 0x7fffffff;
\r
811 * Move the iterator forward to the next code point, and return that code point,
\r
812 * leaving the iterator positioned at char returned.
\r
813 * For Supplementary chars, the iterator is left positioned at the lead surrogate.
\r
814 * @param ci The character iterator
\r
815 * @return The next code point.
\r
817 static int CINext32(CharacterIterator ci) {
\r
818 // If the current position is at a surrogate pair, move to the trail surrogate
\r
819 // which leaves it in position for underlying iterator's next() to work.
\r
820 int c= ci.current();
\r
821 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
823 if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
824 c = ci.previous();
\r
828 // For BMP chars, this next() is the real deal.
\r
831 // If we might have a lead surrogate, we need to peek ahead to get the trail
\r
832 // even though we don't want to really be positioned there.
\r
833 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
834 c = CINextTrail32(ci, c);
\r
837 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
\r
838 // We got a supplementary char. Back the iterator up to the position
\r
839 // of the lead surrogate.
\r
846 // Out-of-line portion of the in-line Next32 code.
\r
847 // The call site does an initial ci.next() and calls this function
\r
848 // if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
\r
849 // NOTE: we leave the underlying char iterator positioned in the
\r
850 // middle of a surrogate pair. ci.next() will work correctly
\r
851 // from there, but the ci.getIndex() will be wrong, and needs
\r
853 private static int CINextTrail32(CharacterIterator ci, int lead) {
\r
855 if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
856 char cTrail = ci.next();
\r
857 if (UTF16.isTrailSurrogate(cTrail)) {
\r
858 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
\r
859 (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
\r
860 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
865 if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {
\r
866 retVal = CI_DONE32;
\r
872 private static int CIPrevious32(CharacterIterator ci) {
\r
873 if (ci.getIndex() <= ci.getBeginIndex()) {
\r
876 char trail = ci.previous();
\r
877 int retVal = trail;
\r
878 if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
\r
879 char lead = ci.previous();
\r
880 if (UTF16.isLeadSurrogate(lead)) {
\r
881 retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
\r
882 ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
\r
883 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
891 static int CICurrent32(CharacterIterator ci) {
\r
892 char lead = ci.current();
\r
894 if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
897 if (UTF16.isLeadSurrogate(lead)) {
\r
898 int trail = (int)ci.next();
\r
900 if (UTF16.isTrailSurrogate((char)trail)) {
\r
901 retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
\r
902 (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
\r
903 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
906 if (lead == CharacterIterator.DONE) {
\r
907 if (ci.getIndex() >= ci.getEndIndex()) {
\r
908 retVal = CI_DONE32;
\r
916 //-----------------------------------------------------------------------------------
\r
918 // handleNext(void) All forward iteration vectors through this function.
\r
919 // NOTE: This function is overridden by the dictionary base break iterator.
\r
920 // User level API functions go to the dbbi implementation
\r
921 // when the break iterator type is dbbi.
\r
922 // The DBBI implementation sometimes explicitly calls back to here,
\r
923 // its inherited handleNext().
\r
925 //-----------------------------------------------------------------------------------
\r
927 return handleNext(fRData.fFTable);
\r
931 * The State Machine Engine for moving forward is here.
\r
932 * This function is the heart of the RBBI run time engine.
\r
934 * @param stateTable
\r
935 * @return the new iterator position
\r
937 * A note on supplementary characters and the position of underlying
\r
938 * Java CharacterIterator: Normally, a character iterator is positioned at
\r
939 * the char most recently returned by next(). Within this function, when
\r
940 * a supplementary char is being processed, the char iterator is left
\r
941 * sitting on the trail surrogate, in the middle of the code point.
\r
942 * This is different from everywhere else, where an iterator always
\r
943 * points at the lead surrogate of a supplementary.
\r
945 private int handleNext(short stateTable[]) {
\r
947 short category = 0;
\r
951 int lookaheadStatus = 0;
\r
952 int lookaheadTagIdx = 0;
\r
954 int initialPosition = 0;
\r
955 int lookaheadResult = 0;
\r
956 boolean lookAheadHardBreak =
\r
957 (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
\r
960 System.out.println("Handle Next pos char state category");
\r
963 // No matter what, handleNext always correctly sets the break tag value.
\r
964 fLastStatusIndexValid = true;
\r
965 fLastRuleStatusIndex = 0;
\r
967 // if we're already at the end of the text, return DONE.
\r
968 if (fText == null) {
\r
969 fLastRuleStatusIndex = 0;
\r
970 return BreakIterator.DONE;
\r
973 // Set up the starting char
\r
974 initialPosition = fText.getIndex();
\r
975 result = initialPosition;
\r
976 c = fText.current();
\r
977 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
978 c = CINextTrail32(fText, c);
\r
979 if (c == CI_DONE32) {
\r
980 fLastRuleStatusIndex = 0;
\r
981 return BreakIterator.DONE;
\r
985 // Set the initial state for the state machine
\r
986 state = START_STATE;
\r
987 row = fRData.getRowIndex(state);
\r
990 if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
\r
996 // loop until we reach the end of the text or transition to state 0
\r
997 while (state != STOP_STATE) {
\r
998 if (c == CI_DONE32) {
\r
999 // Reached end of input string.
\r
1000 if (mode == RBBI_END) {
\r
1001 // We have already run the loop one last time with the
\r
1002 // character set to the pseudo {eof} value. Now it is time
\r
1003 // to unconditionally bail out.
\r
1005 if (lookaheadResult > result) {
\r
1006 // We ran off the end of the string with a pending
\r
1007 // look-ahead match.
\r
1008 // Treat this as if the look-ahead condition had been
\r
1009 // met, and return
\r
1010 // the match at the / position from the look-ahead rule.
\r
1011 result = lookaheadResult;
\r
1012 fLastRuleStatusIndex = lookaheadTagIdx;
\r
1013 lookaheadStatus = 0;
\r
1014 } else if (result == initialPosition) {
\r
1015 // Ran off end, no match found.
\r
1016 // move forward one
\r
1017 fText.setIndex(initialPosition);
\r
1022 // Run the loop one last time with the fake end-of-input character category
\r
1027 // Get the char category. An incoming category of 1 or 2 means that
\r
1028 // we are preset for doing the beginning or end of input, and
\r
1029 // that we shouldn't get a category from an actual text input character.
\r
1031 if (mode == RBBI_RUN) {
\r
1032 // look up the current character's character category, which tells us
\r
1033 // which column in the state table to look at.
\r
1035 category = (short) fRData.fTrie.getCodePointValue(c);
\r
1037 // Check the dictionary bit in the character's category.
\r
1038 // Counter is only used by dictionary based iterators (subclasses).
\r
1039 // Chars that need to be handled by a dictionary have a flag bit set
\r
1040 // in their category values.
\r
1042 if ((category & 0x4000) != 0) {
\r
1043 fDictionaryCharCount++;
\r
1044 // And off the dictionary flag bit.
\r
1045 category &= ~0x4000;
\r
1050 System.out.print(" " + RBBIDataWrapper.intToString(fText.getIndex(), 5));
\r
1051 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
\r
1052 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
\r
1055 // look up a state transition in the state table
\r
1056 // state = row->fNextState[category];
\r
1057 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
\r
1058 row = fRData.getRowIndex(state);
\r
1060 // Advance to the next character.
\r
1061 // If this is a beginning-of-input loop iteration, don't advance.
\r
1062 // The next iteration will be processing the first real input character.
\r
1063 if (mode == RBBI_RUN) {
\r
1064 c = (int)fText.next();
\r
1065 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
1066 c = CINextTrail32(fText, c);
\r
1069 if (mode == RBBI_START) {
\r
1074 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
\r
1075 // Match found, common case
\r
1076 result = fText.getIndex();
\r
1077 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
\r
1078 // The iterator has been left in the middle of a surrogate pair.
\r
1079 // We want the start of it.
\r
1083 // Remember the break status (tag) values.
\r
1084 fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
\r
1087 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
\r
1088 if (lookaheadStatus != 0
\r
1089 && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
\r
1090 // Lookahead match is completed. Set the result accordingly, but only
\r
1091 // if no other rule has matched further in the mean time.
\r
1092 result = lookaheadResult;
\r
1093 fLastRuleStatusIndex = lookaheadTagIdx;
\r
1094 lookaheadStatus = 0;
\r
1095 // TODO: make a standalone hard break in a rule work.
\r
1096 if (lookAheadHardBreak) {
\r
1099 // Look-ahead completed, but other rules may match further. Continue on.
\r
1100 // TODO: junk this feature? I don't think it's used anywhere.
\r
1104 lookaheadResult = fText.getIndex();
\r
1105 if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) {
\r
1106 // The iterator has been left in the middle of a surrogate pair.
\r
1107 // We want the beginning of it.
\r
1108 lookaheadResult--;
\r
1110 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
\r
1111 lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
\r
1116 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
\r
1117 // Because this is an accepting state, any in-progress look-ahead match
\r
1118 // is no longer relevant. Clear out the pending lookahead status.
\r
1119 lookaheadStatus = 0;
\r
1122 } // End of state machine main loop
\r
1124 // The state machine is done. Check whether it found a match...
\r
1126 // If the iterator failed to advance in the match engine, force it ahead by one.
\r
1127 // (This really indicates a defect in the break rules. They should always match
\r
1128 // at least one character.)
\r
1129 if (result == initialPosition) {
\r
1130 result = fText.setIndex(initialPosition);
\r
1132 result = fText.getIndex();
\r
1135 // Leave the iterator at our result position.
\r
1136 // (we may have advanced beyond the last accepting position chasing after
\r
1137 // longer matches that never completed.)
\r
1138 fText.setIndex(result);
\r
1140 System.out.println("result = " + result);
\r
// Backward-direction driver for the break state machine.
// Starting from the current index of fText, it pulls code points one at a time with
// CIPrevious32(), maps each one to a category through fRData.fTrie, and follows the
// transitions in stateTable until the machine reaches STOP_STATE or the text runs out.
//
// @param stateTable  the state transition table to run. The entry at
//                    RBBIDataWrapper.FLAGS+1 is tested for the
//                    RBBI_LOOKAHEAD_HARD_BREAK and RBBI_BOF_REQUIRED options.
//
// Side effects: marks the cached rule status as invalid (fLastStatusIndexValid,
// fLastRuleStatusIndex), may increment fDictionaryCharCount, and repositions fText
// to the computed result.
// NOTE(review): presumably the method returns the boundary position held in `result`;
// confirm against the return statement.
1147 private int handlePrevious(short stateTable[]) {
\r
1153 int lookaheadStatus = 0;
\r
1155 int initialPosition = 0;
\r
1156 int lookaheadResult = 0;
\r
// NOTE(review): stateTable is indexed in this initializer, which runs before the
// null check below; a null stateTable would fail here rather than reach that check.
1157 boolean lookAheadHardBreak =
\r
1158 (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
\r
1161 if (fText == null || stateTable == null) {
\r
1164 // handlePrevious() never gets the rule status.
\r
1165 // Flag the status as invalid; if the user ever asks for status, we will need
\r
1166 // to back up, then re-find the break position using handleNext(), which does
\r
1167 // get the status value.
\r
1168 fLastStatusIndexValid = false;
\r
1169 fLastRuleStatusIndex = 0;
\r
1171 // set up the starting char
\r
1172 initialPosition = fText.getIndex();
\r
1173 result = initialPosition;
\r
1174 c = CIPrevious32(fText);
\r
1176 // Set up the initial state for the state machine
\r
1177 state = START_STATE;
\r
1178 row = fRData.getRowIndex(state);
\r
1179 category = 3; // TODO: obsolete? from the old start/run mode scheme?
\r
1181 if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
\r
1183 mode = RBBI_START;
\r
1187 System.out.println("Handle Prev pos char state category ");
\r
1190 // loop until we reach the beginning of the text or transition to state 0
\r
1192 mainLoop: for (;;) {
\r
1194 if (c == CI_DONE32) {
\r
1195 // Ran out of input: moving backwards, this is the start of the text.
\r
1196 if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
\r
1197 // Either this is the old (ICU 3.2 and earlier) format data which
\r
1198 // does not support explicit support for matching {eof}, or
\r
1199 // we have already done the {eof} iteration. Now is the time
\r
1200 // to unconditionally bail out.
\r
1201 if (lookaheadResult < result) {
\r
1202 // We ran off the end of the string with a pending look-ahead match.
\r
1203 // Treat this as if the look-ahead condition had been met, and return
\r
1204 // the match at the / position from the look-ahead rule.
\r
1205 result = lookaheadResult;
\r
1206 lookaheadStatus = 0;
\r
1207 } else if (result == initialPosition) {
\r
1208 // Ran off start, no match found.
\r
1209 // Move one position (towards the start, since we are doing previous.)
\r
1210 fText.setIndex(initialPosition);
\r
1211 CIPrevious32(fText);
\r
1219 if (mode == RBBI_RUN) {
\r
1220 // look up the current character's category, which tells us
\r
1221 // which column in the state table to look at.
\r
1223 category = (short) fRData.fTrie.getCodePointValue(c);
\r
1225 // Check the dictionary bit in the character's category.
\r
1226 // Counter is only used by dictionary based iterators (subclasses).
\r
1227 // Chars that need to be handled by a dictionary have a flag bit set
\r
1228 // in their category values.
\r
1230 if ((category & 0x4000) != 0) {
\r
1231 fDictionaryCharCount++;
\r
1232 // And off the dictionary flag bit.
\r
1233 category &= ~0x4000;
\r
1239 System.out.print(" " + fText.getIndex() + " ");
\r
1240 if (0x20 <= c && c < 0x7f) {
\r
1241 System.out.print(" " + c + " ");
\r
1243 System.out.print(" " + Integer.toHexString(c) + " ");
\r
1245 System.out.println(" " + state + " " + category + " ");
\r
1248 // State Transition - move machine to its next state
\r
1250 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
\r
1251 row = fRData.getRowIndex(state);
\r
1253 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
\r
1254 // Match found, common case, could have lookahead so we move
\r
1256 result = fText.getIndex();
\r
1259 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
\r
1260 if (lookaheadStatus != 0
\r
1261 && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
\r
1262 // Lookahead match is completed. Set the result
\r
1263 // accordingly, but only
\r
1264 // if no other rule has matched further in the mean
\r
1266 result = lookaheadResult;
\r
1267 lookaheadStatus = 0;
\r
1268 // TODO: make a standalone hard break in a rule work.
\r
1270 if (lookAheadHardBreak) {
\r
1273 // Look-ahead completed, but other rules may match further.
\r
1275 // TODO: junk this feature? I don't think that it's used anywhere.
\r
1278 // Hit a possible look-ahead match. We are at the
\r
1279 // position of the '/'. Remember this position.
\r
1280 lookaheadResult = fText.getIndex();
\r
1281 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
\r
1285 // not lookahead...
\r
1286 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
\r
1287 // This is a plain (non-look-ahead) accepting state.
\r
1288 if (!lookAheadHardBreak) {
\r
1289 // Clear out any pending look-ahead matches,
\r
1290 // but only if not doing the lookAheadHardBreak option
\r
1291 // which needs to force a break no matter what is going
\r
1292 // on with the rest of the match, i.e. we can't abandon
\r
1293 // a partially completed look-ahead match because
\r
1294 // some other rule matched further than the '/' position
\r
1295 // in the look-ahead match.
\r
1296 lookaheadStatus = 0;
\r
1300 } // end of innerBlock. "break innerBlock" in above code comes out here.
\r
1303 if (state == STOP_STATE) {
\r
1304 // Normal loop exit is here
\r
1308 // then move iterator position backwards one character
\r
1310 if (mode == RBBI_RUN) {
\r
1311 c = CIPrevious32(fText);
\r
1313 if (mode == RBBI_START) {
\r
1319 } // End of the main loop.
\r
1321 // The state machine is done. Check whether it found a match...
\r
1323 // If the iterator failed to move in the match engine, force it back by one.
\r
1324 // (This really indicates a defect in the break rules. They should always match
\r
1325 // at least one character.)
\r
1326 if (result == initialPosition) {
\r
1327 result = fText.setIndex(initialPosition);
\r
1328 CIPrevious32(fText);
\r
1329 result = fText.getIndex();
\r
// Leave the iterator at our result position; the loop above may have moved
// fText past the last position recorded in result.
1332 fText.setIndex(result);
\r
1334 System.out.println("Result = " + result);
\r
1344 //-------------------------------------------------------------------------------
\r
1348 // isDictionaryChar Return true if the category lookup for this char
\r
1350 // indicates that it is in the set of dictionary lookup
\r
1356 // This function is intended for use by dictionary based
\r
1358 // break iterators.
\r
1362 //-------------------------------------------------------------------------------
\r
1364 boolean isDictionaryChar(int c) {
\r
1366 short category = (short) fRData.fTrie.getCodePointValue(c);
\r
1368 return (category & 0x4000) != 0;
\r