jars/icu4j-4_2_1-src/src/com/ibm/icu/text/RuleBasedBreakIterator.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2005-2008 International Business Machines Corporation and          *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.text;\r
   8 \r
   9 import java.text.CharacterIterator;\r
  10 import java.io.IOException;\r
  11 import java.io.InputStream;\r
  12 import java.io.OutputStream;\r
  13 import java.io.ByteArrayInputStream;\r
  14 import java.io.ByteArrayOutputStream;\r
  15 \r
  16 import com.ibm.icu.impl.Assert;\r
  17 import com.ibm.icu.impl.ICUDebug;\r
  18 \r
  19 \r
  20 /**\r
  21  * Rule Based Break Iterator \r
  22  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.\r
  23  * \r
  24  * @stable ICU 2.0\r
  25  */\r
  26 public class RuleBasedBreakIterator extends BreakIterator {\r
  27 \r
  28     \r
  29     //=======================================================================\r
  30     // Constructors & Factories\r
  31     //=======================================================================\r
  32     \r
    /**
     * Creates an iterator without attaching any rule data.  The body is empty:
     * <code>fRData</code> is not assigned here and <code>init()</code> is not
     * called, so callers (such as the compiled-rules factory in this class)
     * must install the rule data themselves.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    public RuleBasedBreakIterator() {
    }
  39 \r
  40     /**\r
  41      * Create a break iterator from a precompiled set of rules.\r
  42      * @internal\r
  43      * @deprecated This API is ICU internal only.\r
  44      */\r
  45     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {\r
  46         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();\r
  47         This.fRData = RBBIDataWrapper.get(is);\r
  48         return This;   \r
  49     }\r
  50     \r
  51     /*private RuleBasedBreakIterator(RuleBasedBreakIterator other) {\r
  52         // TODO: check types.\r
  53         fRData = other.fRData;\r
  54         if (fText != null) {\r
  55             fText = (CharacterIterator)(other.fText.clone());   \r
  56         }\r
  57     }*/\r
  58 \r
  59     /**\r
  60      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.\r
  61      * @param rules The break rules to be used.\r
  62      * @stable ICU 2.2\r
  63      */\r
  64     public RuleBasedBreakIterator(String rules)  {\r
  65         init();\r
  66         try {\r
  67             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();\r
  68             compileRules(rules, ruleOS);\r
  69             byte [] ruleBA = ruleOS.toByteArray();\r
  70             InputStream ruleIS = new ByteArrayInputStream(ruleBA);\r
  71             fRData = RBBIDataWrapper.get(ruleIS);\r
  72         } catch (IOException e) {\r
  73             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,\r
  74             //  causing bogus compiled rules to be produced, but with no compile error raised.\r
  75             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "\r
  76                     + e.getMessage());\r
  77             throw rte;\r
  78         }\r
  79     }\r
  80     \r
  81     \r
  82     //=======================================================================\r
  83     // Boilerplate\r
  84     //=======================================================================\r
  85     \r
  86     /**\r
  87      * Clones this iterator.\r
  88      * @return A newly-constructed RuleBasedBreakIterator with the same\r
  89      * behavior as this one.\r
  90      * @stable ICU 2.0\r
  91      */\r
  92     public Object clone()\r
  93     {\r
  94         RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();\r
  95         if (fText != null) {\r
  96             result.fText = (CharacterIterator)(fText.clone());   \r
  97         }\r
  98         return result;\r
  99     }\r
 100 \r
 101     /**\r
 102      * Returns true if both BreakIterators are of the same class, have the same\r
 103      * rules, and iterate over the same text.\r
 104      * @stable ICU 2.0\r
 105      */\r
 106     public boolean equals(Object that) {\r
 107         try {\r
 108             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;\r
 109             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {\r
 110                 return false;   \r
 111             }\r
 112             if (fRData != null && other.fRData != null && \r
 113                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {\r
 114                 return false;\r
 115             }\r
 116             if (fText == null && other.fText == null) {\r
 117                 return true;   \r
 118             }\r
 119             if (fText == null || other.fText == null) {\r
 120                 return false;   \r
 121             }\r
 122             return fText.equals(other.fText);\r
 123         }\r
 124         catch(ClassCastException e) {\r
 125             return false;\r
 126         }\r
 127      }\r
 128 \r
 129     /**\r
 130      * Returns the description (rules) used to create this iterator.\r
 131      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())\r
 132      * @stable ICU 2.0\r
 133      */\r
 134     public String toString() {\r
 135         String   retStr = null;\r
 136         if (fRData != null) {\r
 137             retStr =  fRData.fRuleSource;\r
 138         }\r
 139         return retStr;\r
 140     }\r
 141 \r
 142     /**\r
 143      * Compute a hashcode for this BreakIterator\r
 144      * @return A hash code\r
 145      * @stable ICU 2.0\r
 146      */\r
 147     public int hashCode()\r
 148     {\r
 149         return fRData.fRuleSource.hashCode(); \r
 150     }\r
 151 \r
 152     \r
 153     /** \r
 154      * Tag value for "words" that do not fit into any of other categories. \r
 155      * Includes spaces and most punctuation. \r
 156      * @draft ICU 3.0 \r
 157      * @provisional This is a draft API and might change in a future release of ICU.\r
 158      */\r
 159     public static final int WORD_NONE           = 0;\r
 160 \r
 161     /**\r
 162      * Upper bound for tags for uncategorized words. \r
 163      * @draft ICU 3.0 \r
 164      * @provisional This is a draft API and might change in a future release of ICU.\r
 165      */\r
 166     public static final int WORD_NONE_LIMIT     = 100;\r
 167 \r
 168     /**\r
 169      * Tag value for words that appear to be numbers, lower limit. \r
 170      * @draft ICU 3.0 \r
 171      * @provisional This is a draft API and might change in a future release of ICU.\r
 172      */\r
 173     public static final int WORD_NUMBER         = 100;\r
 174 \r
 175     /** \r
 176      * Tag value for words that appear to be numbers, upper limit.\r
 177      * @draft ICU 3.0 \r
 178      * @provisional This is a draft API and might change in a future release of ICU.\r
 179      */\r
 180     public static final int WORD_NUMBER_LIMIT   = 200;\r
 181 \r
 182     /** \r
 183      * Tag value for words that contain letters, excluding\r
 184      * hiragana, katakana or ideographic characters, lower limit. \r
 185      * @draft ICU 3.0 \r
 186      * @provisional This is a draft API and might change in a future release of ICU.\r
 187      */\r
 188     public static final int WORD_LETTER         = 200;\r
 189 \r
 190     /** \r
 191      * Tag value for words containing letters, upper limit \r
 192      * @draft ICU 3.0 \r
 193      * @provisional This is a draft API and might change in a future release of ICU.\r
 194      */\r
 195     public static final int WORD_LETTER_LIMIT   = 300;\r
 196 \r
 197     /** \r
 198      * Tag value for words containing kana characters, lower limit\r
 199      * @draft ICU 3.0 \r
 200      * @provisional This is a draft API and might change in a future release of ICU.\r
 201      */\r
 202     public static final int WORD_KANA           = 300;\r
 203 \r
 204     /** \r
 205      * Tag value for words containing kana characters, upper limit\r
 206      * @draft ICU 3.0 \r
 207      * @provisional This is a draft API and might change in a future release of ICU.\r
 208      */\r
 209     public static final int WORD_KANA_LIMIT     = 400;\r
 210 \r
 211     /**\r
 212      * Tag value for words containing ideographic characters, lower limit\r
 213      * @draft ICU 3.0 \r
 214      * @provisional This is a draft API and might change in a future release of ICU.\r
 215      */\r
 216     public static final int WORD_IDEO           = 400;\r
 217 \r
 218     /**\r
 219      * Tag value for words containing ideographic characters, upper limit\r
 220      * @draft ICU 3.0 \r
 221      * @provisional This is a draft API and might change in a future release of ICU.\r
 222      */\r
 223     public static final int WORD_IDEO_LIMIT     = 500;\r
 224 \r
 225    \r
 226     \r
 227     \r
    // State numbers used with the rule state tables.
    private static final int  START_STATE = 1;     // The state number of the starting state
    private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
    
    // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
    //               of user text.  A variable with this enum type keeps track of where we
    //               are.  The state machine only fetches user text input while in RUN mode.
    // NOTE(review): START and END presumably label the extra iteration before and after
    //               the user text respectively -- confirm against the state machine code.
    private static final int  RBBI_START  = 0;
    private static final int  RBBI_RUN    = 1;
    private static final int  RBBI_END   = 2;
 237 \r
 238     /*\r
 239      * The character iterator through which this BreakIterator accesses the text.\r
 240      */\r
 241     private CharacterIterator   fText = new java.text.StringCharacterIterator("");\r
 242     \r
 243     /**\r
 244      * The rule data for this BreakIterator instance\r
 245      * @internal\r
 246      * @deprecated This API is ICU internal only.\r
 247      */\r
 248     protected RBBIDataWrapper     fRData;\r
 249     \r
 250     /*\r
 251      * Index of the Rule {tag} values for the most recent match. \r
 252      */\r
 253     private int                 fLastRuleStatusIndex;\r
 254 \r
 255     /*\r
 256      * Rule tag value valid flag.\r
 257      * Some iterator operations don't intrinsically set the correct tag value.\r
 258      * This flag lets us lazily compute the value if we are ever asked for it.\r
 259      */\r
 260     private boolean             fLastStatusIndexValid;\r
 261 \r
 262     /**\r
 263      * Counter for the number of characters encountered with the "dictionary"\r
 264      *   flag set.  Normal RBBI iterators don't use it, although the code\r
 265      *   for updating it is live.  Dictionary Based break iterators (a subclass\r
 266      *   of us) access this field directly.\r
 267      * @internal\r
 268      * @deprecated This API is ICU internal only.\r
 269      */\r
 270      protected int fDictionaryCharCount;\r
 271 \r
 272     /**\r
 273      * Debugging flag.  Trace operation of state machine when true.\r
 274      * @internal\r
 275      * @deprecated This API is ICU internal only.\r
 276      */\r
 277     public static boolean       fTrace;\r
 278 \r
 279     /*\r
 280      * ICU debug argument name for RBBI\r
 281      */\r
 282     private static final String RBBI_DEBUG_ARG = "rbbi";\r
 283 \r
    /**
     * Dump the contents of the state table and character classes for this break iterator.
     * For debugging only.  Simply delegates to the dump() method of the rule
     * data object; there is no null check, so rule data must already be attached.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    public void dump() {
        this.fRData.dump();   
    }
 293 \r
 294     private static boolean debugInitDone = false;\r
 295     \r
 296     private void init() {\r
 297         fLastStatusIndexValid = true;\r
 298         fDictionaryCharCount  = 0;\r
 299 \r
 300  \r
 301         if (debugInitDone == false) {\r
 302             fTrace = ICUDebug.enabled(RBBI_DEBUG_ARG)\r
 303                 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;\r
 304             debugInitDone = true;\r
 305         }\r
 306     }\r
 307 \r
    /*
     * Compiles rule source text by delegating to RBBIRuleBuilder.compileRules(),
     * which is given the stream that receives the compiled form.
     * An IOException raised by the builder is passed on to the caller.
     */
    private static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        RBBIRuleBuilder.compileRules(rules, ruleBinary);
    }
 311     \r
 312     //=======================================================================\r
 313     // BreakIterator overrides\r
 314     //=======================================================================\r
 315 \r
 316     /**\r
 317      * Sets the current iteration position to the beginning of the text.\r
 318      * (i.e., the CharacterIterator's starting offset).\r
 319      * @return The offset of the beginning of the text.\r
 320      * @stable ICU 2.0\r
 321      */\r
 322     public int first() {\r
 323         fLastRuleStatusIndex  = 0;\r
 324         fLastStatusIndexValid = true;\r
 325         if (fText == null) {\r
 326             return BreakIterator.DONE;\r
 327         }\r
 328         fText.first();\r
 329         return fText.getIndex();\r
 330     }\r
 331     \r
 332     \r
 333     /**\r
 334      * Sets the current iteration position to the end of the text.\r
 335      * (i.e., the CharacterIterator's ending offset).\r
 336      * @return The text's past-the-end offset.\r
 337      * @stable ICU 2.0\r
 338      */\r
 339     public int last() {\r
 340         if (fText == null) {\r
 341             fLastRuleStatusIndex  = 0;\r
 342             fLastStatusIndexValid = true;\r
 343             return BreakIterator.DONE;\r
 344         }\r
 345 \r
 346         // I'm not sure why, but t.last() returns the offset of the last character,\r
 347         // rather than the past-the-end offset\r
 348         //\r
 349         //   (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...\r
 350         //     will work correctly.)\r
 351 \r
 352 \r
 353         fLastStatusIndexValid = false;\r
 354         int pos = fText.getEndIndex();\r
 355         fText.setIndex(pos);\r
 356         return pos;\r
 357     }\r
 358     \r
 359     \r
 360     /**\r
 361      * Advances the iterator either forward or backward the specified number of steps.\r
 362      * Negative values move backward, and positive values move forward.  This is\r
 363      * equivalent to repeatedly calling next() or previous().\r
 364      * @param n The number of steps to move.  The sign indicates the direction\r
 365      * (negative is backwards, and positive is forwards).\r
 366      * @return The character offset of the boundary position n boundaries away from\r
 367      * the current one.\r
 368      * @stable ICU 2.0\r
 369      */\r
 370     public int next(int n) {\r
 371         int result = current();\r
 372         while (n > 0) {\r
 373             result = handleNext();\r
 374             --n;\r
 375         }\r
 376         while (n < 0) {\r
 377             result = previous();\r
 378             ++n;\r
 379         }\r
 380         return result;\r
 381     }\r
 382     \r
 383     \r
    /**
     * Advances the iterator to the next boundary position.
     * This is a direct delegation to the no-argument handleNext().
     * @return The position of the first boundary after this one.
     * @stable ICU 2.0
     */
    public int next() {
        return handleNext();
    }
 392     \r
 393     \r
    /**
     * Moves the iterator backwards, to the last boundary preceding this one.
     * @return The position of the last boundary position preceding this one,
     *         or BreakIterator.DONE when there is no text or the iterator is
     *         already at the beginning of the text.
     * @stable ICU 2.0
     */
    public int previous() {
        // if we're already sitting at the beginning of the text, return DONE
        if (fText == null || current() == fText.getBeginIndex()) {
            fLastRuleStatusIndex  = 0;
            fLastStatusIndexValid = true;
            return BreakIterator.DONE;
        }

        // When the rule data has either safe-point table, a single call to
        // handlePrevious() with fRTable supplies the answer directly.
        if (fRData.fSRTable != null || fRData.fSFTable != null) {
            return handlePrevious(fRData.fRTable);
        }

        // old rule syntax
        // set things up.  handlePrevious() will back us up to some valid
        // break position before the current position (we back our internal
        // iterator up one step to prevent handlePrevious() from returning
        // the current position), but not necessarily the last one before
        // where we started

        int       start = current();

        CIPrevious32(fText);
        int       lastResult    = handlePrevious(fRData.fRTable);
        if (lastResult == BreakIterator.DONE) {
            // Nothing found going backwards: fall back to the start of the text.
            lastResult = fText.getBeginIndex();
            fText.setIndex(lastResult);
        }
        int       result        = lastResult;
        int       lastTag       = 0;
        boolean   breakTagValid = false;

        // iterate forward from the known break position until we pass our
        // starting point.  The last break position before the starting
        // point is our return value

        for (;;) {
            result         = handleNext();
            if (result == BreakIterator.DONE || result >= start) {
                break;
            }
            lastResult     = result;
            lastTag        = fLastRuleStatusIndex;
            breakTagValid  = true;
        }

        // fLastRuleStatusIndex wants to have the value for the section of text preceding
        // the result position that we are to return (in lastResult.)  If
        // the backwards rules overshot and the above loop had to do two or more
        // handleNext()s to move up to the desired return position, we will have a valid
        // tag value. But, if handlePrevious() took us to exactly the correct result position,
        // we won't have a tag value for that position, which is only set by handleNext().

        // set the current iteration position to be the last break position
        // before where we started, and then return that value
        fText.setIndex(lastResult);
        fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
        fLastStatusIndexValid = breakTagValid;
        return lastResult;
    }
    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the specified position.
     *         When the offset is before the beginning of the text, the result
     *         of first() is returned; when it is at or past the end (or no text
     *         is set), the result of calling next() from the end is returned.
     * @stable ICU 2.0
     */
    public int following(int offset) {
        // Start from a default (zero) rule status.
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = true;
        // if the offset passed in is at or past the end of the text (or there
        // is no text), move to the end and return whatever next() reports from
        // there; if it's before the beginning, return the text's starting offset
        if (fText == null || offset >= fText.getEndIndex()) {
            last();
            return next();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // otherwise, set our internal iteration position (temporarily)
        // to the position passed in.  If this is the _beginning_ position,
        // then we can just use next() to get our return value

        int result = 0;

        if (fRData.fSRTable != null) {
            // Safe Point Reverse rules exist.
            //   This allows us to use the optimum algorithm.
            fText.setIndex(offset);
            // move forward one codepoint to prepare for moving back to a
            // safe point.
            // this handles offset being in the middle of a supplementary character
            CINext32(fText);
            // handlePrevious will move most of the time to < 1 boundary away
            handlePrevious(fRData.fSRTable);
            // Step forward until the boundary found lies strictly after offset.
            result = next();
            while (result <= offset) {
                result = next();
            }
            return result;
        }
        if (fRData.fSFTable != null) {
            // No Safe point reverse table, but there is a safe pt forward table.
            // 
            fText.setIndex(offset);
            CIPrevious32(fText);
            // handle next will give result >= offset
            handleNext(fRData.fSFTable);
            // previous will give result 0 or 1 boundary away from offset,
            // most of the time, so walk backwards until a boundary at or
            // before offset is seen; the boundary seen just before it is the answer.
            int oldresult = previous();
            while (oldresult > offset) {
                result = previous();
                if (result <= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            // The first previous() already landed at or before offset:
            // step forward again, twice if the first step does not pass offset.
            result = next();
            if (result <= offset) {
                return next();
            }
            return result;
        }
        // otherwise, we have to sync up first.  Use handlePrevious() to back
        // us up to a known break position before the specified position (if
        // we can determine that the specified position is a break position,
        // we don't back up at all).  This may or may not be the last break
        // position at or before our starting position.  Advance forward
        // from here until we've passed the starting position.  The position
        // we stop on will be the first break position after the specified one.
        // old rule syntax

        fText.setIndex(offset);
        if (offset == fText.getBeginIndex()) {
            return handleNext();
        }
        result = previous();

        while (result != BreakIterator.DONE && result <= offset) {
            result = next();
        }

        return result;
    }
    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     *         When the offset is past the end of the text (or no text is set)
     *         the result of last() is returned; when it is before the
     *         beginning, the result of first() is returned.
     * @stable ICU 2.0
     */
    public int preceding(int offset) {
        // if the offset passed in is past the end of the text (or there is no
        // text), return the result of last(); if it's before the beginning,
        // return the text's starting offset
        if (fText == null || offset > fText.getEndIndex()) {
            // return BreakIterator::DONE;
            return last();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // if we start by updating the current iteration position to the
        // position specified by the caller, we can just use previous()
        // to carry out this operation

        int  result;
        if (fRData.fSFTable != null) {
            /// todo synwee
            // new rule syntax
            fText.setIndex(offset);
            // move backwards one codepoint to prepare for moving forwards to a
            // safe point.
            // this handles offset being in the middle of a supplementary character
            CIPrevious32(fText);
            handleNext(fRData.fSFTable);
            // Step backwards until the boundary found lies strictly before offset.
            result = previous();
            while (result >= offset) {
                result = previous();
            }
            return result;
        }
        if (fRData.fSRTable != null) {
            // backup plan if forward safe table is not available
            fText.setIndex(offset);
            CINext32(fText);
            // handle previous will give result <= offset
            handlePrevious(fRData.fSRTable);

            // next will give result 0 or 1 boundary away from offset,
            // most of the time, so walk forwards until a boundary at or
            // after offset is seen; the boundary seen just before it is the answer.
            int oldresult = next();
            while (oldresult < offset) {
                result = next();
                if (result >= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            // The first next() already landed at or after offset:
            // step back again, twice if the first step does not pass offset.
            result = previous();
            if (result >= offset) {
                return previous();
            }
            return result;
        }

        // old rule syntax
        fText.setIndex(offset);
        return previous();
    }
 615 \r
 616     /**\r
 617      * Throw IllegalArgumentException unless begin <= offset < end.\r
 618      * @stable ICU 2.0\r
 619      */\r
 620     protected static final void checkOffset(int offset, CharacterIterator text) {\r
 621         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {\r
 622             throw new IllegalArgumentException("offset out of bounds");\r
 623         }\r
 624     }\r
 625 \r
 626 \r
 627 /**\r
 628  * Returns true if the specfied position is a boundary position.  As a side\r
 629  * effect, leaves the iterator pointing to the first boundary position at\r
 630  * or after "offset".\r
 631  * @param offset the offset to check.\r
 632  * @return True if "offset" is a boundary position.\r
 633  * @stable ICU 2.0\r
 634  */\r
 635 public boolean isBoundary(int offset) {\r
 636     checkOffset(offset, fText);\r
 637     \r
 638     // the beginning index of the iterator is always a boundary position by definition\r
 639     if (offset == fText.getBeginIndex()) {\r
 640         first();       // For side effects on current position, tag values.\r
 641         return true;\r
 642     }\r
 643 \r
 644     if (offset == fText.getEndIndex()) {\r
 645         last();       // For side effects on current position, tag values.\r
 646         return true;\r
 647     }\r
 648 \r
 649     // otherwise, we can use following() on the position before the specified\r
 650     // one and return true if the position we get back is the one the user\r
 651     // specified\r
 652     \r
 653     // return following(offset - 1) == offset;\r
 654     // TODO:  check whether it is safe to revert to the simpler offset-1 code\r
 655     //         The safe rules may take care of unpaired surrogates ok.\r
 656     fText.setIndex(offset);\r
 657     CIPrevious32(fText);\r
 658     int  pos = fText.getIndex();\r
 659     boolean result = following(pos) == offset;\r
 660     return result;\r
 661 }\r
 662 \r
 663 /**\r
 664  * Returns the current iteration position.\r
 665  * @return The current iteration position.\r
 666  * @stable ICU 2.0\r
 667  */\r
 668 public int current() {\r
 669     return (fText != null) ? fText.getIndex() : BreakIterator.DONE;\r
 670     }\r
 671 \r
 672 \r
 673 \r
 674 private void makeRuleStatusValid() {\r
 675     if (fLastStatusIndexValid == false) {\r
 676         //  No cached status is available.\r
 677         if (fText == null || current() == fText.getBeginIndex()) {\r
 678             //  At start of text, or there is no text.  Status is always zero.\r
 679             fLastRuleStatusIndex = 0;\r
 680             fLastStatusIndexValid = true;\r
 681         } else {\r
 682             //  Not at start of text.  Find status the tedious way.\r
 683             int pa = current();\r
 684             previous();\r
 685             int pb = next();\r
 686             Assert.assrt (pa == pb);\r
 687         }\r
 688         Assert.assrt(fLastStatusIndexValid == true);\r
 689         Assert.assrt(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fRData.fStatusTable.length);\r
 690     }\r
 691 }\r
 692 \r
 693 \r
 694 /**\r
 695  * Return the status tag from the break rule that determined the most recently\r
 696  * returned break position.  The values appear in the rule source\r
 697  * within brackets, {123}, for example.  For rules that do not specify a\r
 698  * status, a default value of 0 is returned.  If more than one rule applies,\r
 699  * the numerically largest of the possible status values is returned.\r
 700  * <p>\r
 701  * Of the standard types of ICU break iterators, only the word break\r
 702  * iterator provides status values.  The values are defined in\r
 703  * class RuleBasedBreakIterator, and allow distinguishing between words\r
 704  * that contain alphabetic letters, "words" that appear to be numbers,\r
 705  * punctuation and spaces, words containing ideographic characters, and\r
 706  * more.  Call <code>getRuleStatus</code> after obtaining a boundary\r
 707  * position from <code>next()<code>, <code>previous()</code>, or \r
 708  * any other break iterator functions that returns a boundary position.\r
 709  * <p>\r
 710  * @return the status from the break rule that determined the most recently\r
 711  * returned break position.\r
 712  *\r
 713  * @draft ICU 3.0\r
 714  * @provisional This is a draft API and might change in a future release of ICU.\r
 715  */\r
 716 \r
 717 public int  getRuleStatus() {\r
 718     makeRuleStatusValid();\r
 719     //   Status records have this form:\r
 720     //           Count N         <--  fLastRuleStatusIndex points here.\r
 721     //           Status val 0\r
 722     //           Status val 1\r
 723     //              ...\r
 724     //           Status val N-1  <--  the value we need to return\r
 725     //   The status values are sorted in ascending order.\r
 726     //   This function returns the last (largest) of the array of status values.\r
 727     int  idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];\r
 728     int  tagVal = fRData.fStatusTable[idx];\r
 729 \r
 730     return tagVal;\r
 731 }\r
 732 \r
 733 \r
 734 \r
 735 /**\r
 736  * Get the status (tag) values from the break rule(s) that determined the most \r
 737  * recently returned break position.  The values appear in the rule source\r
 738  * within brackets, {123}, for example.  The default status value for rules\r
 739  * that do not explicitly provide one is zero.\r
 740  * <p>\r
 741  * The status values used by the standard ICU break rules are defined\r
 742  * as public constants in class RuleBasedBreakIterator.\r
 743  * <p>\r
 744  * If the size  of the output array is insufficient to hold the data,\r
 745  *  the output will be truncated to the available length.  No exception\r
 746  *  will be thrown.\r
 747  *\r
 748  * @param fillInArray an array to be filled in with the status values.  \r
 749  * @return          The number of rule status values from rules that determined \r
 750  *                  the most recent boundary returned by the break iterator.\r
 751  *                  In the event that the array is too small, the return value\r
 752  *                  is the total number of status values that were available,\r
 753  *                  not the reduced number that were actually returned.\r
 754  * @draft ICU 3.0\r
 755  * @provisional This is a draft API and might change in a future release of ICU.\r
 756  */\r
 757 public int getRuleStatusVec(int[] fillInArray) {\r
 758     makeRuleStatusValid();\r
 759     int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];\r
 760     if (fillInArray != null) {  \r
 761         int numToCopy = Math.min(numStatusVals, fillInArray.length);\r
 762         for (int i=0; i<numToCopy; i++) {\r
 763             fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];\r
 764         }\r
 765     }\r
 766     return numStatusVals;\r
 767  }\r
 768 \r
 769 \r
 770 /**\r
 771  * Return a CharacterIterator over the text being analyzed.  This version\r
 772  * of this method returns the actual CharacterIterator we're using internally.\r
 773  * Changing the state of this iterator can have undefined consequences.  If\r
 774  * you need to change it, clone it first.\r
 775  * @return An iterator over the text being analyzed.\r
 776  * @stable ICU 2.0\r
 777  */\r
 778     public CharacterIterator getText() {\r
 779         return fText;\r
 780     }\r
 781 \r
 782 \r
 783     /**\r
 784      * Set the iterator to analyze a new piece of text.  This function resets\r
 785      * the current iteration position to the beginning of the text.\r
 786      * @param newText An iterator over the text to analyze.\r
 787      * @stable ICU 2.0\r
 788      */\r
 789     public void setText(CharacterIterator newText) {\r
 790         fText = newText;\r
 791         this.first();\r
 792     }\r
 793     \r
 794     /**\r
 795      * Control debug, trace and dump options.\r
 796      * @internal\r
 797      * @deprecated This API is ICU internal only.\r
 798      */\r
 799     protected static String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?\r
 800                                         ICUDebug.value(RBBI_DEBUG_ARG) : null;\r
 801 \r
 802     \r
 803     // 32 bit Char value returned from when an iterator has run out of range.\r
 804     //     Positive value so fast case (not end, not surrogate) can be checked\r
 805     //     with a single test.\r
 806     private static int CI_DONE32 = 0x7fffffff;\r
 807     \r
 808     /**\r
 809      * Move the iterator forward to the next code point, and return that code point,\r
 810      *   leaving the iterator positioned at char returned.\r
 811      *   For Supplementary chars, the iterator is left positioned at the lead surrogate.\r
 812      * @param ci  The character iterator\r
 813      * @return    The next code point.\r
 814      */\r
 815      static int CINext32(CharacterIterator ci) {\r
 816         // If the current position is at a surrogate pair, move to the trail surrogate\r
 817         //   which leaves it in positon for underlying iterator's next() to work.\r
 818         int c= ci.current();\r
 819         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {\r
 820             c = ci.next();   \r
 821             if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {\r
 822                c = ci.previous();   \r
 823             }\r
 824         }\r
 825 \r
 826         // For BMP chars, this next() is the real deal.\r
 827         c = ci.next();\r
 828         \r
 829         // If we might have a lead surrogate, we need to peak ahead to get the trail \r
 830         //  even though we don't want to really be positioned there.\r
 831         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {\r
 832             c = CINextTrail32(ci, c);   \r
 833         }\r
 834         \r
 835         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {\r
 836             // We got a supplementary char.  Back the iterator up to the postion\r
 837             // of the lead surrogate.\r
 838             ci.previous();   \r
 839         }\r
 840         return c;\r
 841    }\r
 842 \r
 843     \r
 844     // Out-of-line portion of the in-line Next32 code.\r
 845     // The call site does an initial ci.next() and calls this function\r
 846     //    if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.\r
 847     // NOTE:  we leave the underlying char iterator positioned in the\r
 848     //        middle of a surroage pair.  ci.next() will work correctly\r
 849     //        from there, but the ci.getIndex() will be wrong, and needs\r
 850     //        adjustment.\r
 851     private static int CINextTrail32(CharacterIterator ci, int lead) {\r
 852         int retVal = lead;\r
 853         if (lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {\r
 854             char  cTrail = ci.next();\r
 855             if (UTF16.isTrailSurrogate(cTrail)) {\r
 856                 retVal = ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +\r
 857                             (cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +\r
 858                             UTF16.SUPPLEMENTARY_MIN_VALUE;\r
 859             } else {\r
 860                 ci.previous();\r
 861             }\r
 862         } else {\r
 863             if (lead == CharacterIterator.DONE && ci.getIndex() >= ci.getEndIndex()) {\r
 864                 retVal = CI_DONE32;\r
 865             }\r
 866         }\r
 867         return retVal;\r
 868     }\r
 869        \r
 870     private static int CIPrevious32(CharacterIterator ci) {\r
 871         if (ci.getIndex() <= ci.getBeginIndex()) {\r
 872             return CI_DONE32;   \r
 873         }\r
 874         char trail = ci.previous();\r
 875         int retVal = trail;\r
 876         if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {\r
 877             char lead = ci.previous();\r
 878             if (UTF16.isLeadSurrogate(lead)) {\r
 879                 retVal = (((int)lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +\r
 880                           ((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +\r
 881                           UTF16.SUPPLEMENTARY_MIN_VALUE;\r
 882             } else {\r
 883                 ci.next();\r
 884             }           \r
 885         }\r
 886         return retVal;\r
 887     }\r
 888    \r
 889     static int CICurrent32(CharacterIterator ci) {\r
 890         char  lead   = ci.current();\r
 891         int   retVal = lead;\r
 892         if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {\r
 893             return retVal;   \r
 894         }\r
 895         if (UTF16.isLeadSurrogate(lead)) {\r
 896             int  trail = (int)ci.next();\r
 897             ci.previous();\r
 898             if (UTF16.isTrailSurrogate((char)trail)) {\r
 899                 retVal = ((lead  - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +\r
 900                          (trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +\r
 901                          UTF16.SUPPLEMENTARY_MIN_VALUE;\r
 902             }\r
 903          } else {\r
 904             if (lead == CharacterIterator.DONE) {\r
 905                 if (ci.getIndex() >= ci.getEndIndex())   {\r
 906                     retVal = CI_DONE32;   \r
 907                 }\r
 908             }\r
 909          }\r
 910         return retVal;\r
 911     }\r
 912     \r
 913 \r
 914     //-----------------------------------------------------------------------------------\r
 915     //\r
 916     //      handleNext(void)    All forward iteration vectors through this function.\r
 917     //                          NOTE:  This function is overridden by the dictionary base break iterator.\r
 918     //                                 User level API functions go to the dbbi implementation\r
 919     //                                     when the break iterator type is dbbi.\r
 920     //                                 The DBBI implementation sometimes explicitly calls back to here, \r
 921     //                                     its inherited handleNext().\r
 922     //                      \r
 923     //-----------------------------------------------------------------------------------\r
 924     int handleNext() {\r
 925         return handleNext(fRData.fFTable);\r
 926     }\r
 927 \r
    /**
     * The State Machine Engine for moving forward is here.
     * This function is the heart of the RBBI run time engine.
     * <p>
     * Starting at the current position of fText, it feeds one code point at a
     * time through stateTable until the machine reaches STOP_STATE or the input
     * is exhausted, remembering the most recent accepting position as the
     * result.  On return fText is positioned at the result, and
     * fLastRuleStatusIndex holds the tag index recorded for it.
     * <p>
     * A note on supplementary characters and the position of underlying
     * Java CharacterIterator:   Normally, a character iterator is positioned at
     * the char most recently returned by next().  Within this function, when
     * a supplementary char is being processed, the char iterator is left
     * sitting on the trail surrogate, in the middle of the code point.
     * This is different from everywhere else, where an iterator always
     * points at the lead surrogate of a supplementary.
     *
     * @param stateTable the state transition table to run
     * @return the new iterator position, or BreakIterator.DONE when no text has
     *         been set or the iterator is already at the end of the text
     */
    private int handleNext(short stateTable[]) {
        int               state;
        short             category        = 0;
        int               mode;
        int               row;
        int               c;
        int               lookaheadStatus = 0;
        int               lookaheadTagIdx = 0;
        int               result          = 0;
        int               initialPosition = 0;
        int               lookaheadResult = 0;
        // Table-wide option flag, read once from the table's flags word.
        boolean          lookAheadHardBreak = 
            (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
        
        if (fTrace) {
            System.out.println("Handle Next   pos      char  state category");
        }

        // No matter what, handleNext always correctly sets the break tag value.
        fLastStatusIndexValid = true;
        fLastRuleStatusIndex  = 0;

        // No text has been set: there is nothing to iterate over, return DONE.
        if (fText == null) {
            fLastRuleStatusIndex = 0;
            return BreakIterator.DONE;
        }

        // Set up the starting char
        initialPosition = fText.getIndex();
        result          = initialPosition;
        c               = fText.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            // Possibly a lead surrogate or DONE; resolve it to a full code point.
            c = CINextTrail32(fText, c);
            if (c == CI_DONE32) {
                // Already at the end of the text.
                fLastRuleStatusIndex = 0;
                return BreakIterator.DONE;
            }
        }

        // Set the initial state for the state machine
        state           = START_STATE;
        row             = fRData.getRowIndex(state); 
        category        = 3;
        mode            = RBBI_RUN;
        if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            // The table asks for a beginning-of-input iteration: run the loop
            // once with category 2 before consuming any real character.
            category = 2;
            mode     = RBBI_START;
        }


        // loop until we reach the end of the text or transition to state 0
        while (state != STOP_STATE) {
            if (c == CI_DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END) {
                    // We have already run the loop one last time with the
                    // character set to the pseudo {eof} value. Now it is time
                    // to unconditionally bail out.

                    if (lookaheadResult > result) {
                        // We ran off the end of the string with a pending
                        // look-ahead match.
                        // Treat this as if the look-ahead condition had been
                        // met, and return
                        // the match at the / position from the look-ahead rule.
                        result = lookaheadResult;
                        fLastRuleStatusIndex = lookaheadTagIdx;
                        // NOTE(review): lookaheadStatus is not read again after
                        // the break below, so this reset has no further effect.
                        lookaheadStatus = 0;
                    } else if (result == initialPosition) {
                        // Ran off end, no match found.
                        // move forward one
                        // (result itself is recomputed by the identical check
                        //  after the loop.)
                        fText.setIndex(initialPosition);
                        CINext32(fText);
                    }
                    break;
                }
                // Run the loop one last time with the fake end-of-input character category
                mode = RBBI_END;
                category = 1;
            }
            
            // Get the char category.  An incoming category of 1 or 2 means that
            //      we are preset for doing the beginning or end of input, and
            //      that we shouldn't get a category from an actual text input character.
            //
            if (mode == RBBI_RUN) {
                // look up the current character's character category, which tells us
                // which column in the state table to look at.
                //
                category = (short) fRData.fTrie.getCodePointValue(c);
                
                // Check the dictionary bit in the character's category.
                //    Counter is only used by dictionary based iterators (subclasses).
                //    Chars that need to be handled by a dictionary have a flag bit set
                //    in their category values.
                //
                if ((category & 0x4000) != 0)  {
                    fDictionaryCharCount++;
                    //  And off the dictionary flag bit.
                    category &= ~0x4000;
                }
           }

            if (fTrace) {
                System.out.print("            " +  RBBIDataWrapper.intToString(fText.getIndex(), 5)); 
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }

            // look up a state transition in the state table
            //     state = row->fNextState[category];
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row   = fRData.getRowIndex(state);  

            // Advance to the next character.  
            // If this is a beginning-of-input loop iteration, don't advance.
            //    The next iteration will be processing the first real input character.
            if (mode == RBBI_RUN) {
                c = (int)fText.next(); 
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = CINextTrail32(fText, c);
                }
            } else {
                if (mode == RBBI_START) {
                    mode = RBBI_RUN;
                }
            }
             
            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                // Match found, common case
                result = fText.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != CI_DONE32) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the start of it.
                    result--;
                }

                //  Remember the break status (tag) values.
                fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
            }

            if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                if (lookaheadStatus != 0
                    && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                    // Lookahead match is completed.  Set the result to the
                    // remembered '/' position.
                    // NOTE(review): the result is overwritten unconditionally
                    // here; nothing checks whether another rule has matched
                    // further in the mean time.
                    result               = lookaheadResult;
                    fLastRuleStatusIndex = lookaheadTagIdx;
                    lookaheadStatus      = 0;
                    // TODO: make a standalone hard break in a rule work.
                    if (lookAheadHardBreak) {
                        // Returns immediately, without repositioning fText at
                        // the result as the normal exit path below does.
                        return result;
                    }
                    // Look-ahead completed, but other rules may match further.  Continue on.
                    //   TODO:  junk this feature?  I don't think it's used anywhere.
                    continue;
                }

                // Hit a possible look-ahead match.  Remember the current
                // position together with the status and tag index of this row.
                lookaheadResult = fText.getIndex();
                if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=CI_DONE32) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the beginning  of it.
                    lookaheadResult--;
                }
                lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
                continue;
            }


            if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                // Because this is an accepting state, any in-progress look-ahead match
                //   is no longer relevant.  Clear out the pending lookahead status.
                lookaheadStatus = 0; 
            }
            
         }        // End of state machine main loop

        // The state machine is done.  Check whether it found a match...

        // If the iterator failed to advance in the match engine, force it ahead by one.
        //   (This really indicates a defect in the break rules.  They should always match
        //    at least one character.)
        if (result == initialPosition) {
            // The value assigned from setIndex() is overwritten two lines below;
            // only the repositioning matters here.
            result = fText.setIndex(initialPosition);
            CINext32(fText);
            result = fText.getIndex();
        }

        // Leave the iterator at our result position.
        //   (we may have advanced beyond the last accepting position chasing after
        //    longer matches that never completed.)
        fText.setIndex(result);
        if (fTrace) {
            System.out.println("result = " + result);
        }
        return result;
    }
1142 \r
1143     \r
1144     \r
1145     private int handlePrevious(short stateTable[]) {\r
1146         int            state;\r
1147         int            category           = 0;\r
1148         int            mode;\r
1149         int            row;        \r
1150         int            c;\r
1151         int            lookaheadStatus    = 0;\r
1152         int            result             = 0;\r
1153         int            initialPosition    = 0;\r
1154         int            lookaheadResult    = 0;\r
1155         boolean        lookAheadHardBreak = \r
1156             (stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;\r
1157         \r
1158         \r
1159         if (fText == null || stateTable == null) {\r
1160             return 0;\r
1161         }\r
1162         // handlePrevious() never gets the rule status.\r
1163         // Flag the status as invalid; if the user ever asks for status, we will need\r
1164         // to back up, then re-find the break position using handleNext(), which does\r
1165         // get the status value.\r
1166         fLastStatusIndexValid = false;\r
1167         fLastRuleStatusIndex  = 0;\r
1168         \r
1169         // set up the starting char\r
1170         initialPosition = fText.getIndex();\r
1171         result          = initialPosition;\r
1172         c               = CIPrevious32(fText);\r
1173         \r
1174         // Set up the initial state for the state machine\r
1175         state = START_STATE;\r
1176         row = fRData.getRowIndex(state);\r
1177         category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?\r
1178         mode     = RBBI_RUN;\r
1179         if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {\r
1180             category = 2;\r
1181             mode     = RBBI_START;\r
1182         }\r
1183         \r
1184         if (fTrace) {\r
1185             System.out.println("Handle Prev   pos   char  state category ");\r
1186         }\r
1187         \r
1188         // loop until we reach the beginning of the text or transition to state 0\r
1189         //\r
1190         mainLoop: for (;;) {\r
1191             innerBlock: {\r
1192                 if (c == CI_DONE32) {\r
1193                     // Reached end of input string.\r
1194                     if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {\r
1195                         // Either this is the old (ICU 3.2 and earlier) format data which\r
1196                         // does not support explicit support for matching {eof}, or\r
1197                         // we have already done the {eof} iteration.  Now is the time\r
1198                         // to unconditionally bail out.\r
1199                         if (lookaheadResult < result) {\r
1200                             // We ran off the end of the string with a pending look-ahead match.\r
1201                             // Treat this as if the look-ahead condition had been met, and return\r
1202                             //  the match at the / position from the look-ahead rule.\r
1203                             result = lookaheadResult;\r
1204                             lookaheadStatus = 0;\r
1205                         } else if (result == initialPosition) {\r
1206                             // Ran off start, no match found.\r
1207                             // Move one position (towards the start, since we are doing previous.)\r
1208                             fText.setIndex(initialPosition);\r
1209                             CIPrevious32(fText);\r
1210                         }\r
1211                         break mainLoop;\r
1212                     }\r
1213                     mode = RBBI_END;\r
1214                     category = 1;\r
1215                 }\r
1216                 \r
1217                 if (mode == RBBI_RUN) {\r
1218                     // look up the current character's category, which tells us\r
1219                     // which column in the state table to look at.\r
1220                     //\r
1221                     category = (short) fRData.fTrie.getCodePointValue(c);\r
1222                     \r
1223                     // Check the dictionary bit in the character's category.\r
1224                     //    Counter is only used by dictionary based iterators (subclasses).\r
1225                     //    Chars that need to be handled by a dictionary have a flag bit set\r
1226                     //    in their category values.\r
1227                     //\r
1228                     if ((category & 0x4000) != 0)  {\r
1229                         fDictionaryCharCount++;\r
1230                         //  And off the dictionary flag bit.\r
1231                         category &= ~0x4000;\r
1232                     }\r
1233                 }\r
1234                 \r
1235                 \r
1236                 if (fTrace) {\r
1237                     System.out.print("             " + fText.getIndex() + "   ");\r
1238                     if (0x20 <= c && c < 0x7f) {\r
1239                         System.out.print("  " + c + "  ");\r
1240                     } else {\r
1241                         System.out.print(" " + Integer.toHexString(c) + " ");\r
1242                     }\r
1243                     System.out.println(" " + state + "  " + category + " ");\r
1244                 }\r
1245                 \r
1246                 // State Transition - move machine to its next state\r
1247                 //\r
1248                 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];\r
1249                 row = fRData.getRowIndex(state);\r
1250                 \r
1251                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {\r
1252                     // Match found, common case, could have lookahead so we move\r
1253                     // on to check it\r
1254                     result = fText.getIndex();\r
1255                 }\r
1256                 \r
1257                 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {\r
1258                     if (lookaheadStatus != 0\r
1259                             && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {\r
1260                         // Lookahead match is completed. Set the result\r
1261                         // accordingly, but only\r
1262                         // if no other rule has matched further in the mean\r
1263                         // time.\r
1264                         result = lookaheadResult;\r
1265                         lookaheadStatus = 0;\r
1266                         // TODO: make a standalone hard break in a rule work.\r
1267                         \r
1268                         if (lookAheadHardBreak) {\r
1269                             break mainLoop;\r
1270                         }\r
1271                         // Look-ahead completed, but other rules may match further.\r
1272                         // Continue on.\r
1273                         // TODO: junk this feature?  I don't think that it's used anywhere.\r
1274                         break innerBlock;\r
1275                     }\r
1276                     // Hit a possible look-ahead match. We are at the\r
1277                     // position of the '/'. Remember this position.\r
1278                     lookaheadResult = fText.getIndex();\r
1279                     lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];\r
1280                     break innerBlock;\r
1281                 } \r
1282                 \r
1283                 // not lookahead...\r
1284                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {\r
1285                     // This is a plain (non-look-ahead) accepting state.\r
1286                     if (!lookAheadHardBreak) {\r
1287                         // Clear out any pending look-ahead matches,\r
1288                         // but only if not doing the lookAheadHardBreak option\r
1289                         // which needs to force a break no matter what is going\r
1290                         // on with the rest of the match, i.e. we can't abandon\r
1291                         // a partially completed look-ahead match because\r
1292                         // some other rule matched further than the '/' position\r
1293                         // in the look-ahead match.\r
1294                         lookaheadStatus = 0; \r
1295                     }\r
1296                 }\r
1297                 \r
1298             } // end of innerBlock.  "break innerBlock" in above code comes out here.\r
1299         \r
1300         \r
1301             if (state == STOP_STATE) {\r
1302                 // Normal loop exit is here\r
1303                 break mainLoop;\r
1304             }\r
1305         \r
1306             // then move iterator position backwards one character\r
1307             //\r
1308             if (mode == RBBI_RUN) {\r
1309                 c = CIPrevious32(fText);\r
1310             } else {\r
1311                 if (mode == RBBI_START) {\r
1312                     mode = RBBI_RUN;\r
1313                 }\r
1314             }\r
1315         \r
1316         \r
1317         }   // End of the main loop.\r
1318         \r
1319         // The state machine is done.  Check whether it found a match...\r
1320         //\r
1321         // If the iterator failed to advance in the match engine, force it ahead by one.\r
1322         //   (This really indicates a defect in the break rules.  They should always match\r
1323         //    at least one character.)\r
1324         if (result == initialPosition) {\r
1325             result = fText.setIndex(initialPosition);\r
1326             CIPrevious32(fText);\r
1327             result = fText.getIndex();\r
1328         }\r
1329         \r
1330         fText.setIndex(result);\r
1331         if (fTrace) {\r
1332             System.out.println("Result = " + result);\r
1333         }\r
1334         \r
1335         return result;\r
1336     }\r
1337 \r
1338 \r
1339 \r
1340 \r
1341 \r
1342     //-------------------------------------------------------------------------------\r
1343     \r
1344     //\r
1345     \r
1346     //  isDictionaryChar      Return true if the category lookup for this char\r
1347     \r
1348     //                        indicates that it is in the set of dictionary lookup\r
1349     \r
1350     //                        chars.\r
1351     \r
1352     //\r
1353     \r
1354     //                        This function is intended for use by dictionary based\r
1355     \r
1356     //                        break iterators.\r
1357     \r
1358     //\r
1359     \r
1360     //-------------------------------------------------------------------------------\r
1361     \r
1362     boolean isDictionaryChar(int c) {\r
1363     \r
1364         short  category = (short) fRData.fTrie.getCodePointValue(c);\r
1365     \r
1366         return (category & 0x4000) != 0;\r
1367     \r
1368     }\r
1369 \r
1370 }\r
1371 //eof\r