jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 2003-2013 International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  *******************************************************************************
   6  */
   7  package com.ibm.icu.dev.test.rbbi;
   8
   9
  10 // Monkey testing of RuleBasedBreakIterator
  11 import java.util.ArrayList;
  12 import java.util.Arrays;
  13 import java.util.List;
  14 import java.util.Locale;
  15
  16 import com.ibm.icu.dev.test.TestFmwk;
  17 import com.ibm.icu.lang.UCharacter;
  18 import com.ibm.icu.lang.UProperty;
  19 import com.ibm.icu.text.BreakIterator;
  20 import com.ibm.icu.text.RuleBasedBreakIterator;
  21 import com.ibm.icu.text.UTF16;
  22 import com.ibm.icu.text.UnicodeSet;
  23
  24
  25 /**
  26  * Monkey tests for RBBI.  These tests have independent implementations of
  27  * the Unicode TR boundary rules, and compare results between these and ICU's
  28  * implementation, using random data.
  29  *
  30  * Tests cover Grapheme Cluster (char), Word and Line breaks
  31  *
  32  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
  33  *
  34  */
  35 public class RBBITestMonkey extends TestFmwk {
  36
  37     public static void main(String[] args) {
  38         new RBBITestMonkey().run(args);
  39     }
  40
//
//     class RBBIMonkeyKind
//
//        Monkey Test for Break Iteration
//        Abstract interface class.   Concrete derived classes independently
//        implement the break rules for different iterator types.
//
//        The Monkey Test itself doesn't know which type of break iterator it is
//        testing, but works purely in terms of the interface defined here.
//
  51     abstract static class RBBIMonkeyKind {
  52
  53         // Return a List of UnicodeSets, representing the character classes used
  54         //   for this type of iterator.
  55         abstract  List  charClasses();
  56
  57         // Set the test text on which subsequent calls to next() will operate
  58         abstract  void   setText(StringBuffer text);
  59
  60         // Find the next break position, starting from the specified position.
  61         // Return -1 after reaching end of string.
  62         abstract   int   next(int i);
  63
  64         // A Character Property, one of the constants defined in class UProperty.
  65         //   The value of this property will be displayed for the characters
  66         //    near any test failure.
  67         int   fCharProperty;
  68     }
  69
  70
  71     /**
  72      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
  73      * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
  74      */
  75     static class RBBICharMonkey extends RBBIMonkeyKind {
  76         List                      fSets;
  77
  78         UnicodeSet                fCRLFSet;
  79         UnicodeSet                fControlSet;
  80         UnicodeSet                fExtendSet;
  81         UnicodeSet                fRegionalIndicatorSet;
  82         UnicodeSet                fPrependSet;
  83         UnicodeSet                fSpacingSet;
  84         UnicodeSet                fLSet;
  85         UnicodeSet                fVSet;
  86         UnicodeSet                fTSet;
  87         UnicodeSet                fLVSet;
  88         UnicodeSet                fLVTSet;
  89         UnicodeSet                fHangulSet;
  90         UnicodeSet                fAnySet;
  91
  92         StringBuffer              fText;
  93
  94
  95     RBBICharMonkey() {
  96         fText       = null;
  97         fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
  98         fCRLFSet    = new UnicodeSet("[\\r\\n]");
  99         fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
 100         fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
 101         fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
 102         fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
 103         fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
 104         fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
 105         fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
 106         fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
 107         fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
 108         fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
 109         fHangulSet  = new UnicodeSet();
 110         fHangulSet.addAll(fLSet);
 111         fHangulSet.addAll(fVSet);
 112         fHangulSet.addAll(fTSet);
 113         fHangulSet.addAll(fLVSet);
 114         fHangulSet.addAll(fLVTSet);
 115
 116         fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]");
 117
 118         fSets       = new ArrayList();
 119         fSets.add(fCRLFSet);
 120         fSets.add(fControlSet);
 121         fSets.add(fExtendSet);
 122         fSets.add(fRegionalIndicatorSet);
 123         if (!fPrependSet.isEmpty()) {
 124             fSets.add(fPrependSet);
 125         }
 126         fSets.add(fSpacingSet);
 127         fSets.add(fHangulSet);
 128         fSets.add(fAnySet);
 129      }
 130
 131
 132     void setText(StringBuffer s) {
 133         fText = s;
 134     }
 135
 136     List charClasses() {
 137         return fSets;
 138     }
 139
 140     int next(int prevPos) {
 141         int    p1, p2, p3;    // Indices of the significant code points around the
 142                               //   break position being tested.  The candidate break
 143                               //   location is before p2.
 144
 145         int     breakPos = -1;
 146
 147         int   c1, c2, c3;     // The code points at p0, p1, p2 & p3.
 148
 149         // Previous break at end of string.  return DONE.
 150         if (prevPos >= fText.length()) {
 151             return -1;
 152         }
 153         p1 = p2 = p3 = prevPos;
 154         c3 =  UTF16.charAt(fText, prevPos);
 155         c1 = c2 = 0;
 156
 157         // Loop runs once per "significant" character position in the input text.
 158         for (;;) {
 159             // Move all of the positions forward in the input string.
 160             p1 = p2;  c1 = c2;
 161             p2 = p3;  c2 = c3;
 162
 163             // Advance p3 by one codepoint
 164             p3 = moveIndex32(fText, p3, 1);
 165             c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
 166
 167             if (p1 == p2) {
 168                 // Still warming up the loop.  (won't work with zero length strings, but we don't care)
 169                 continue;
 170             }
 171             if (p2 == fText.length()) {
 172                 // Reached end of string.  Always a break position.
 173                 break;
 174             }
 175
 176             // Rule  GB3   CR x LF
 177             //     No Extend or Format characters may appear between the CR and LF,
 178             //     which requires the additional check for p2 immediately following p1.
 179             //
 180             if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
 181                 continue;
 182             }
 183
 184             // Rule (GB4).   ( Control | CR | LF ) <break>
 185             if (fControlSet.contains(c1) ||
 186                 c1 == 0x0D ||
 187                 c1 == 0x0A)  {
 188                 break;
 189             }
 190
 191             // Rule (GB5)    <break>  ( Control | CR | LF )
 192             //
 193             if (fControlSet.contains(c2) ||
 194                 c2 == 0x0D ||
 195                 c2 == 0x0A)  {
 196                 break;
 197             }
 198
 199
 200             // Rule (GB6)  L x ( L | V | LV | LVT )
 201             if (fLSet.contains(c1) &&
 202                 (fLSet.contains(c2)  ||
 203                     fVSet.contains(c2)  ||
 204                     fLVSet.contains(c2) ||
 205                     fLVTSet.contains(c2))) {
 206                 continue;
 207             }
 208
 209             // Rule (GB7)    ( LV | V )  x  ( V | T )
 210             if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
 211                 (fVSet.contains(c2) || fTSet.contains(c2)))  {
 212                 continue;
 213             }
 214
 215             // Rule (GB8)    ( LVT | T)  x T
 216             if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
 217                 fTSet.contains(c2))  {
 218                 continue;
 219             }
 220
 221             // Rule (GB8a)   Regional_Indicator x Regional_Indicator
 222             if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
 223                 continue;
 224             }
 225
 226             // Rule (GB9)    Numeric x ALetter
 227             if (fExtendSet.contains(c2))  {
 228                 continue;
 229             }
 230
 231             // Rule (GB9a)   x  SpacingMark
 232             if (fSpacingSet.contains(c2)) {
 233                 continue;
 234             }
 235
 236             // Rule (GB9b)   Prepend x
 237             if (fPrependSet.contains(c1)) {
 238                 continue;
 239             }
 240
 241             // Rule (GB10)  Any  <break>  Any
 242             break;
 243         }
 244
 245         breakPos = p2;
 246         return breakPos;
 247         }
 248     }
 249
 250
 251     /**
 252      *
 253      * Word Monkey Test Class
 254      *
 255      *
 256      *
 257      */
 258     static class RBBIWordMonkey extends RBBIMonkeyKind {
 259         List                      fSets;
 260         StringBuffer              fText;
 261
 262         UnicodeSet                fCRSet;
 263         UnicodeSet                fLFSet;
 264         UnicodeSet                fNewlineSet;
 265         UnicodeSet                fRegionalIndicatorSet;
 266         UnicodeSet                fKatakanaSet;
 267         UnicodeSet                fHebrew_LetterSet;
 268         UnicodeSet                fALetterSet;
 269         UnicodeSet                fSingle_QuoteSet;
 270         UnicodeSet                fDouble_QuoteSet;
 271         UnicodeSet                fMidNumLetSet;
 272         UnicodeSet                fMidLetterSet;
 273         UnicodeSet                fMidNumSet;
 274         UnicodeSet                fNumericSet;
 275         UnicodeSet                fFormatSet;
 276         UnicodeSet                fExtendSet;
 277         UnicodeSet                fExtendNumLetSet;
 278         UnicodeSet                fOtherSet;
 279         UnicodeSet                fDictionaryCjkSet;
 280
 281
 282         RBBIWordMonkey() {
 283             fCharProperty    = UProperty.WORD_BREAK;
 284
 285             fDictionaryCjkSet= new UnicodeSet("[[:Script=Hangul:][:Han:][:Hiragana:][:Katakana:]]");
 286             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
 287             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
 288             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
 289             fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
 290             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
 291             fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
 292             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
 293             fALetterSet.removeAll(fDictionaryCjkSet);
 294             fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
 295             fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
 296             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
 297             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
 298             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
 299             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
 300             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
 301             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
 302             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
 303
 304             fOtherSet        = new UnicodeSet();
 305             fOtherSet.complement();
 306             fOtherSet.removeAll(fCRSet);
 307             fOtherSet.removeAll(fLFSet);
 308             fOtherSet.removeAll(fNewlineSet);
 309             fOtherSet.removeAll(fALetterSet);
 310             fOtherSet.removeAll(fSingle_QuoteSet);
 311             fOtherSet.removeAll(fDouble_QuoteSet);
 312             fOtherSet.removeAll(fKatakanaSet);
 313             fOtherSet.removeAll(fHebrew_LetterSet);
 314             fOtherSet.removeAll(fMidLetterSet);
 315             fOtherSet.removeAll(fMidNumSet);
 316             fOtherSet.removeAll(fNumericSet);
 317             fOtherSet.removeAll(fFormatSet);
 318             fOtherSet.removeAll(fExtendSet);
 319             fOtherSet.removeAll(fExtendNumLetSet);
 320             fOtherSet.removeAll(fRegionalIndicatorSet);
 321             // Inhibit dictionary characters from being tested at all.
 322             // remove surrogates so as to not generate higher CJK characters
 323             fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
 324             fOtherSet.removeAll(fDictionaryCjkSet);
 325
 326             fSets            = new ArrayList();
 327             fSets.add(fCRSet);
 328             fSets.add(fLFSet);
 329             fSets.add(fNewlineSet);
 330             fSets.add(fRegionalIndicatorSet);
 331             fSets.add(fHebrew_LetterSet);
 332             fSets.add(fALetterSet);
 333             //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
 334             fSets.add(fSingle_QuoteSet);
 335             fSets.add(fDouble_QuoteSet);
 336             fSets.add(fMidLetterSet);
 337             fSets.add(fMidNumLetSet);
 338             fSets.add(fMidNumSet);
 339             fSets.add(fNumericSet);
 340             fSets.add(fFormatSet);
 341             fSets.add(fExtendSet);
 342             fSets.add(fExtendNumLetSet);
 343             fSets.add(fOtherSet);
 344         }
 345
 346
 347         List  charClasses() {
 348          return fSets;
 349         }
 350
 351         void   setText(StringBuffer s) {
 352             fText = s;
 353         }
 354
 355         int   next(int prevPos) {
 356             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
 357                                         //   break position being tested.  The candidate break
 358                                         //   location is before p2.
 359             int     breakPos = -1;
 360
 361             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
 362
 363             // Previous break at end of string.  return DONE.
 364             if (prevPos >= fText.length()) {
 365                 return -1;
 366             }
 367             /*p0 =*/ p1 = p2 = p3 = prevPos;
 368             c3 = UTF16.charAt(fText, prevPos);
 369             c0 = c1 = c2 = 0;
 370
 371
 372
 373             // Loop runs once per "significant" character position in the input text.
 374             for (;;) {
 375                 // Move all of the positions forward in the input string.
 376                 /*p0 = p1;*/  c0 = c1;
 377                 p1 = p2;  c1 = c2;
 378                 p2 = p3;  c2 = c3;
 379
 380                 // Advance p3 by    X(Extend | Format)*   Rule 4
 381                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
 382                 do {
 383                     p3 = moveIndex32(fText, p3, 1);
 384                     c3 = -1;
 385                     if (p3>=fText.length()) {
 386                         break;
 387                     }
 388                     c3 = UTF16.charAt(fText, p3);
 389                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
 390                         break;
 391                     }
 392                 }
 393                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
 394
 395                 if (p1 == p2) {
 396                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
 397                     continue;
 398                 }
 399                 if (p2 == fText.length()) {
 400                     // Reached end of string.  Always a break position.
 401                     break;
 402                 }
 403
 404                 // Rule (3)   CR x LF
 405                 //     No Extend or Format characters may appear between the CR and LF,
 406                 //     which requires the additional check for p2 immediately following p1.
 407                 //
 408                 if (c1==0x0D && c2==0x0A) {
 409                     continue;
 410                 }
 411
 412                 // Rule (3a)  Break before and after newlines (including CR and LF)
 413                 //
 414                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
 415                     break;
 416                 }
 417                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
 418                     break;
 419                 }
 420
 421                 // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
 422                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
 423                     (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
 424                     continue;
 425                 }
 426
 427                 // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
 428                 //
 429                 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
 430                      (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
 431                      (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
 432                     continue;
 433                 }
 434
 435                 // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
 436                 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
 437                     (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
 438                     (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
 439                     continue;
 440                 }
 441
 442                 // Rule (7a)     Hebrew_Letter x Single_Quote
 443                 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
 444                     continue;
 445                 }
 446
 447                 // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
 448                 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
 449                     continue;
 450                 }
 451
 452                 // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
 453                 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
 454                     continue;
 455                 }
 456
 457                 //  Rule (8)    Numeric x Numeric
 458                 if (fNumericSet.contains(c1) &&
 459                         fNumericSet.contains(c2))  {
 460                     continue;
 461                 }
 462
 463                 // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
 464                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
 465                     fNumericSet.contains(c2))  {
 466                     continue;
 467                 }
 468
 469                 // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
 470                 if (fNumericSet.contains(c1) &&
 471                     (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
 472                     continue;
 473                 }
 474
 475                 // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
 476                 if (fNumericSet.contains(c0) &&
 477                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
 478                         fNumericSet.contains(c2)) {
 479                     continue;
 480                 }
 481
 482                 // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
 483                 if (fNumericSet.contains(c1) &&
 484                     (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
 485                     setContains(fNumericSet, c3)) {
 486                     continue;
 487                 }
 488
 489                 // Rule (13)  Katakana x Katakana
 490                 if (fKatakanaSet.contains(c1) &&
 491                         fKatakanaSet.contains(c2))  {
 492                     continue;
 493                 }
 494
 495                 // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
 496                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
 497                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
 498                         fExtendNumLetSet.contains(c2)) {
 499                     continue;
 500                 }
 501
 502                 // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
 503                 if (fExtendNumLetSet.contains(c1) &&
 504                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
 505                          fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
 506                     continue;
 507                 }
 508
 509
 510                 // Rule 13c   Do not break between Regional Indicators.
 511                 //            Regional_Indicator  ×   Regional_Indicator
 512                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
 513                     continue;
 514                 }
 515
 516                 // Rule 14.  Break found here.
 517                 break;
 518             }
 519
 520             breakPos = p2;
 521             return breakPos;
 522         }
 523
 524     }
 525
 526
 527     static class RBBILineMonkey extends RBBIMonkeyKind {
 528
 529         List        fSets;
 530
 531         UnicodeSet  fBK;
 532         UnicodeSet  fCR;
 533         UnicodeSet  fLF;
 534         UnicodeSet  fCM;
 535         UnicodeSet  fNL;
 536         UnicodeSet  fSG;
 537         UnicodeSet  fWJ;
 538         UnicodeSet  fZW;
 539         UnicodeSet  fGL;
 540         UnicodeSet  fCB;
 541         UnicodeSet  fSP;
 542         UnicodeSet  fB2;
 543         UnicodeSet  fBA;
 544         UnicodeSet  fBB;
 545         UnicodeSet  fHY;
 546         UnicodeSet  fCL;
 547         UnicodeSet  fCP;
 548         UnicodeSet  fEX;
 549         UnicodeSet  fIN;
 550         UnicodeSet  fNS;
 551         UnicodeSet  fOP;
 552         UnicodeSet  fQU;
 553         UnicodeSet  fIS;
 554         UnicodeSet  fNU;
 555         UnicodeSet  fPO;
 556         UnicodeSet  fPR;
 557         UnicodeSet  fSY;
 558         UnicodeSet  fAI;
 559         UnicodeSet  fAL;
 560         UnicodeSet  fHL;
 561         UnicodeSet  fID;
 562         UnicodeSet  fSA;
 563         UnicodeSet  fJL;
 564         UnicodeSet  fJV;
 565         UnicodeSet  fJT;
 566         UnicodeSet  fH2;
 567         UnicodeSet  fH3;
 568         UnicodeSet  fRI;
 569         UnicodeSet  fXX;
 570
 571         StringBuffer  fText;
 572         int           fOrigPositions;
 573
 574
 575
 576         RBBILineMonkey()
 577         {
 578             fCharProperty  = UProperty.LINE_BREAK;
 579             fSets          = new ArrayList();
 580
 581             fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
 582             fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
 583             fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
 584             fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
 585             fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
 586             fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
 587             fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
 588             fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
 589             fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
 590             fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
 591             fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
 592             fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
 593             fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
 594             fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
 595             fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
 596             fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
 597             fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
 598             fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
 599             fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
 600             fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
 601             fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
 602             fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
 603             fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
 604             fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
 605             fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
 606             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
 607             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
 608             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
 609             fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
 610             fID    = new UnicodeSet("[\\p{Line_break=ID}]");
 611             fSA    = new UnicodeSet("[\\p{Line_break=SA}]");
 612             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
 613             fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
 614             fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
 615             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
 616             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
 617             fSG    = new UnicodeSet("[\\ud800-\\udfff]");
 618             fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
 619             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
 620
 621
 622             fAL.addAll(fXX);     // Default behavior for XX is identical to AL
 623             fAL.addAll(fAI);     // Default behavior for AI is identical to AL
 624             fAL.addAll(fSA);     // Default behavior for SA is XX, which defaults to AL
 625             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
 626
 627
 628
 629             fSets.add(fBK);
 630             fSets.add(fCR);
 631             fSets.add(fLF);
 632             fSets.add(fCM);
 633             fSets.add(fNL);
 634             fSets.add(fWJ);
 635             fSets.add(fZW);
 636             fSets.add(fGL);
 637             fSets.add(fCB);
 638             fSets.add(fSP);
 639             fSets.add(fB2);
 640             fSets.add(fBA);
 641             fSets.add(fBB);
 642             fSets.add(fHY);
 643             fSets.add(fH2);
 644             fSets.add(fH3);
 645             fSets.add(fCL);
 646             fSets.add(fCP);
 647             fSets.add(fEX);
 648             fSets.add(fIN);
 649             fSets.add(fJL);
 650             fSets.add(fJT);
 651             fSets.add(fJV);
 652             fSets.add(fNS);
 653             fSets.add(fOP);
 654             fSets.add(fQU);
 655             fSets.add(fIS);
 656             fSets.add(fNU);
 657             fSets.add(fPO);
 658             fSets.add(fPR);
 659             fSets.add(fSY);
 660             fSets.add(fAI);
 661             fSets.add(fAL);
 662             fSets.add(fHL);
 663             fSets.add(fID);
 664             fSets.add(fWJ);
 665             fSets.add(fSA);
 666             fSets.add(fSG);
 667             fSets.add(fRI);
 668         }
 669
 670         void setText(StringBuffer s) {
 671             fText       = s;
 672         }
 673
 674
 675
 676
 677         int next(int startPos) {
 678             int    pos;       //  Index of the char following a potential break position
 679             int    thisChar;  //  Character at above position "pos"
 680
 681             int    prevPos;   //  Index of the char preceding a potential break position
 682             int    prevChar;  //  Character at above position.  Note that prevChar
 683                               //   and thisChar may not be adjacent because combining
 684                               //   characters between them will be ignored.
 685             int    prevCharX2; //  Character before prevChar, more contex for LB 21a
 686
 687             int    nextPos;   //  Index of the next character following pos.
 688                               //     Usually skips over combining marks.
 689             int    tPos;      //  temp value.
 690             int    matchVals[]  = null;       // Number  Expression Match Results
 691
 692
 693             if (startPos >= fText.length()) {
 694                 return -1;
 695             }
 696
 697
 698             // Initial values for loop.  Loop will run the first time without finding breaks,
 699             //                           while the invalid values shift out and the "this" and
 700             //                           "prev" positions are filled in with good values.
 701             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
 702             thisChar = prevChar  = prevCharX2 = 0;
 703             nextPos  = startPos;
 704
 705
 706             // Loop runs once per position in the test text, until a break position
 707             //  is found.  In each iteration, we are testing for a possible break
 708             //  just preceding the character at index "pos".  The character preceding
 709             //  this char is at postion "prevPos"; because of combining sequences,
 710             //  "prevPos" can be arbitrarily far before "pos".
 711             for (;;) {
 712                 // Advance to the next position to be tested.
 713                 prevCharX2 = prevChar;
 714                 prevPos   = pos;
 715                 prevChar  = thisChar;
 716                 pos       = nextPos;
 717                 nextPos   = moveIndex32(fText, pos, 1);
 718
 719                 // Rule LB2 - Break at end of text.
 720                 if (pos >= fText.length()) {
 721                     break;
 722                 }
 723
 724                 // Rule LB 9 - adjust for combining sequences.
 725                 //             We do this rule out-of-order because the adjustment does
 726                 //             not effect the way that rules LB 3 through LB 6 match,
 727                 //             and doing it here rather than after LB 6 is substantially
 728                 //             simpler when combining sequences do occur.
 729
 730
 731                 // LB 9         Keep combining sequences together.
 732                 //              advance over any CM class chars at "pos",
 733                 //              result is "nextPos" for the following loop iteration.
 734                 thisChar  = UTF16.charAt(fText, pos);
 735                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
 736                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
 737                     for (;;) {
 738                         if (nextPos == fText.length()) {
 739                             break;
 740                         }
 741                         int nextChar = UTF16.charAt(fText, nextPos);
 742                         if (!fCM.contains(nextChar)) {
 743                             break;
 744                         }
 745                         nextPos = moveIndex32(fText, nextPos, 1);
 746                     }
 747                 }
 748
 749                 // LB 9 Treat X CM* as if it were X
 750                 //        No explicit action required.
 751
 752                 // LB 10     Treat any remaining combining mark as AL
 753                 if (fCM.contains(thisChar)) {
 754                     thisChar = 'A';
 755                 }
 756
 757
 758                 // If the loop is still warming up - if we haven't shifted the initial
 759                 //   -1 positions out of prevPos yet - loop back to advance the
 760                 //    position in the input without any further looking for breaks.
 761                 if (prevPos == -1) {
 762                     continue;
 763                 }
 764
 765                 // LB 4  Always break after hard line breaks,
 766                 if (fBK.contains(prevChar)) {
 767                     break;
 768                 }
 769
 770                 // LB 5  Break after CR, LF, NL, but not inside CR LF
 771                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
 772                     continue;
 773                 }
 774                 if  (fCR.contains(prevChar) ||
 775                      fLF.contains(prevChar) ||
 776                      fNL.contains(prevChar))  {
 777                     break;
 778                 }
 779
 780                 // LB 6  Don't break before hard line breaks
 781                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
 782                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {
 783                     continue;
 784                 }
 785
 786
 787                 // LB 7  Don't break before spaces or zero-width space.
 788                 if (fSP.contains(thisChar)) {
 789                     continue;
 790                 }
 791
 792                 if (fZW.contains(thisChar)) {
 793                     continue;
 794                 }
 795
 796                 // LB 8  Break after zero width space
 797                 if (fZW.contains(prevChar)) {
 798                     break;
 799                 }
 800
 801                 //  LB 9, 10  Already done, at top of loop.
 802                 //
 803
 804
 805                 // LB 11
 806                 //    x  WJ
 807                 //    WJ  x
 808                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
 809                     continue;
 810                 }
 811
 812
 813                 // LB 12
 814                 //        GL x
 815                 if (fGL.contains(prevChar)) {
 816                     continue;
 817                 }
 818
 819                 // LB 12a
 820                 //    [^SP BA HY] x GL
 821                 if (!(fSP.contains(prevChar) ||
 822                       fBA.contains(prevChar) ||
 823                       fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
 824                     continue;
 825                 }
 826
 827
 828
 829                 // LB 13  Don't break before closings.
 830                 //       NU x CL, NU x CP  and NU x IS are not matched here so that they will
 831                 //       fall into LB 17 and the more general number regular expression.
 832                 //
 833                 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
 834                     !fNU.contains(prevChar) && fCP.contains(thisChar) ||
 835                                                fEX.contains(thisChar) ||
 836                     !fNU.contains(prevChar) && fIS.contains(thisChar) ||
 837                     !fNU.contains(prevChar) && fSY.contains(thisChar))    {
 838                     continue;
 839                 }
 840
 841                 // LB 14  Don't break after OP SP*
 842                 //       Scan backwards, checking for this sequence.
 843                 //       The OP char could include combining marks, so we actually check for
 844                 //           OP CM* SP* x
 845                 tPos = prevPos;
 846                 if (fSP.contains(prevChar)) {
 847                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
 848                         tPos=moveIndex32(fText, tPos, -1);
 849                     }
 850                 }
 851                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
 852                     tPos=moveIndex32(fText, tPos, -1);
 853                 }
 854                 if (fOP.contains(UTF16.charAt(fText, tPos))) {
 855                     continue;
 856                 }
 857
 858                 // LB 15 Do not break within "[
 859                 //       QU CM* SP* x OP
 860                 if (fOP.contains(thisChar)) {
 861                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
 862                     tPos = prevPos;
 863                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
 864                         tPos = moveIndex32(fText, tPos, -1);
 865                     }
 866                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
 867                         tPos = moveIndex32(fText, tPos, -1);
 868                     }
 869                     if (fQU.contains(UTF16.charAt(fText, tPos))) {
 870                         continue;
 871                     }
 872                 }
 873
 874                 // LB 16   (CL | CP) SP* x NS
 875                 if (fNS.contains(thisChar)) {
 876                     tPos = prevPos;
 877                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
 878                         tPos = moveIndex32(fText, tPos, -1);
 879                     }
 880                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
 881                         tPos = moveIndex32(fText, tPos, -1);
 882                     }
 883                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
 884                         continue;
 885                     }
 886                 }
 887
 888
 889                 // LB 17        B2 SP* x B2
 890                 if (fB2.contains(thisChar)) {
 891                     tPos = prevPos;
 892                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
 893                         tPos = moveIndex32(fText, tPos, -1);
 894                     }
 895                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
 896                         tPos = moveIndex32(fText, tPos, -1);
 897                     }
 898                     if (fB2.contains(UTF16.charAt(fText, tPos))) {
 899                         continue;
 900                     }
 901                 }
 902
 903                 // LB 18    break after space
 904                 if (fSP.contains(prevChar)) {
 905                     break;
 906                 }
 907
 908                 // LB 19
 909                 //    x   QU
 910                 //    QU  x
 911                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
 912                     continue;
 913                 }
 914
 915                 // LB 20  Break around a CB
 916                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
 917                     break;
 918                 }
 919
 920                 // LB 21
 921                 if (fBA.contains(thisChar) ||
 922                         fHY.contains(thisChar) ||
 923                         fNS.contains(thisChar) ||
 924                         fBB.contains(prevChar) )   {
 925                     continue;
 926                 }
 927
 928                  // LB 21a, HL (HY | BA) x
 929                 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
 930                     continue;
 931                 }
 932
 933                  // LB 21b, SY x HL
 934                 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
 935                     continue;
 936                 }
 937
 938                // LB 22
 939                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
 940                         fHL.contains(prevChar) && fIN.contains(thisChar) ||
 941                         fID.contains(prevChar) && fIN.contains(thisChar) ||
 942                         fIN.contains(prevChar) && fIN.contains(thisChar) ||
 943                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {
 944                     continue;
 945                 }
 946
 947
 948                 // LB 23    ID x PO    (Note:  Leading CM behaves like ID)
 949                 //          AL x NU
 950                 //          NU x AL
 951                 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
 952                         fAL.contains(prevChar) && fNU.contains(thisChar) ||
 953                         fHL.contains(prevChar) && fNU.contains(thisChar) ||
 954                         fNU.contains(prevChar) && fAL.contains(thisChar) ||
 955                         fNU.contains(prevChar) && fHL.contains(thisChar) )   {
 956                    continue;
 957                 }
 958
 959                 // LB 24  Do not break between prefix and letters or ideographs.
 960                 //        PR x ID
 961                 //        PR x AL
 962                 //        PO x AL
 963                 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
 964                     fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
 965                     fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)))  {
 966                     continue;
 967                 }
 968
 969
 970                 // LB 25    Numbers
 971                 matchVals = LBNumberCheck(fText, prevPos, matchVals);
 972                 if (matchVals[0] != -1) {
 973                     // Matched a number.  But could have been just a single digit, which would
 974                     //    not represent a "no break here" between prevChar and thisChar
 975                     int numEndIdx = matchVals[1];  // idx of first char following num
 976                     if (numEndIdx > pos) {
 977                         // Number match includes at least the two chars being checked
 978                         if (numEndIdx > nextPos) {
 979                             // Number match includes additional chars.  Update pos and nextPos
 980                             //   so that next loop iteration will continue at the end of the number,
 981                             //   checking for breaks between last char in number & whatever follows.
 982                             nextPos = numEndIdx;
 983                             pos     = numEndIdx;
 984                             do {
 985                                 pos = moveIndex32(fText, pos, -1);
 986                                 thisChar = UTF16.charAt(fText, pos);
 987                             }
 988                             while (fCM.contains(thisChar));
 989                         }
 990                         continue;
 991                     }
 992                 }
 993
 994
 995                 // LB 26  Do not break Korean Syllables
 996                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
 997                                                 fJV.contains(thisChar) ||
 998                                                 fH2.contains(thisChar) ||
 999                                                 fH3.contains(thisChar))) {
1000                                                     continue;
1001                                                 }
1002
1003                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
1004                     (fJV.contains(thisChar) || fJT.contains(thisChar))) {
1005                         continue;
1006                 }
1007
1008                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
1009                     fJT.contains(thisChar)) {
1010                         continue;
1011                 }
1012
1013                 // LB 27 Treat a Korean Syllable Block the same as ID
1014                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1015                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1016                     fIN.contains(thisChar)) {
1017                         continue;
1018                     }
1019                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1020                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1021                     fPO.contains(thisChar)) {
1022                         continue;
1023                     }
1024                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
1025                     fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
1026                         continue;
1027                     }
1028
1029
1030
1031                 // LB 28 Do not break between alphabetics
1032                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1033                     continue;
1034                 }
1035
1036                 // LB 29  Do not break between numeric punctuation and alphabetics
1037                 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1038                     continue;
1039                 }
1040
1041                 // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
1042                 //          (AL | NU) x OP
1043                 //          CP x (AL | NU)
1044                 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
1045                     continue;
1046                 }
1047                 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
1048                     continue;
1049                 }
1050
1051                 // LB 30a   Do not break between regional indicators.  RI × RI
1052                 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
1053                     continue;
1054                 }
1055
1056                 // LB 31    Break everywhere else
1057                 break;
1058             }
1059
1060             return pos;
1061         }
1062
1063
1064
1065         // Match the following regular expression in the input text.
1066         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
1067         //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
1068         //  retVals array  [0]  index of the start of the match, or -1 if no match
1069         //                 [1]  index of first char following the match.
1070         //  Can not use Java regex because need supplementary character support,
1071         //     and because Unicode char properties version must be the same as in
1072         //     the version of ICU being tested.
1073         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1074             if (retVals == null) {
1075                 retVals = new int[2];
1076              }
1077             retVals[0]     = -1;  // Indicates no match.
1078             int matchState = 0;
1079             int idx        = startIdx;
1080
1081             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1082                 int c = UTF16.charAt(s, idx);
1083                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1084                 switch (matchState) {
1085                     case 0:
1086                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1087                             cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1088                             matchState = 1;
1089                             break;
1090                         }
1091                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1092                             matchState = 4;
1093                             break;
1094                         }
1095                         if (cLBType == UCharacter.LineBreak.HYPHEN) {
1096                             matchState = 4;
1097                             break;
1098                         }
1099                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1100                             matchState = 7;
1101                             break;
1102                         }
1103                         break matchLoop;   /* No Match  */
1104
1105                     case 1:
1106                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1107                             matchState = 1;
1108                             break;
1109                         }
1110                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1111                             matchState = 4;
1112                             break;
1113                         }
1114                         if (cLBType == UCharacter.LineBreak.HYPHEN) {
1115                             matchState = 4;
1116                             break;
1117                         }
1118                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1119                             matchState = 7;
1120                             break;
1121                         }
1122                         break matchLoop;   /* No Match  */
1123
1124
1125                     case 4:
1126                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1127                             matchState = 4;
1128                             break;
1129                         }
1130                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1131                             matchState = 7;
1132                             break;
1133                         }
1134                         break matchLoop;   /* No Match  */
1135                         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
1136                         //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
1137
1138                     case 7:
1139                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1140                             matchState = 7;
1141                             break;
1142                         }
1143                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1144                             matchState = 7;
1145                             break;
1146                         }
1147                         if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1148                             matchState = 7;
1149                             break;
1150                         }
1151                         if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1152                             matchState = 7;
1153                             break;
1154                         }
1155                         if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1156                             matchState = 9;
1157                             break;
1158                         }
1159                         if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1160                             matchState = 9;
1161                             break;
1162                         }
1163                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1164                             matchState = 11;
1165                             break;
1166                         }
1167                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1168                             matchState = 11;
1169                             break;
1170                         }
1171
1172                         break matchLoop;    // Match Complete.
1173                     case 9:
1174                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1175                             matchState = 9;
1176                             break;
1177                         }
1178                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1179                             matchState = 11;
1180                             break;
1181                         }
1182                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1183                             matchState = 11;
1184                             break;
1185                         }
1186                         break matchLoop;    // Match Complete.
1187                     case 11:
1188                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1189                             matchState = 11;
1190                             break;
1191                         }
1192                         break matchLoop;    // Match Complete.
1193                 }
1194             }
1195             if (matchState > 4) {
1196                 retVals[0] = startIdx;
1197                  retVals[1] = idx;
1198             }
1199             return retVals;
1200         }
1201
1202
1203         List  charClasses() {
1204             return fSets;
1205         }
1206
1207
1208
1209     }
1210
1211
1212     /**
1213      *
1214      * Sentence Monkey Test Class
1215      *
1216      *
1217      *
1218      */
1219     static class RBBISentenceMonkey extends RBBIMonkeyKind {
        // All of the character class sets below, in the order the constructor
        //   adds them.  Returned by charClasses().
        List                 fSets;
        // The text being examined; assigned by setText().
        StringBuffer         fText;

        // One set per Sentence_Break property value.  Each is built in the
        //   constructor from a \p{Sentence_Break = ...} pattern.
        UnicodeSet           fSepSet;        // Sep, plus U+000A and U+000D
        UnicodeSet           fFormatSet;     // Format
        UnicodeSet           fSpSet;         // Sp
        UnicodeSet           fLowerSet;      // Lower
        UnicodeSet           fUpperSet;      // Upper
        UnicodeSet           fOLetterSet;    // OLetter
        UnicodeSet           fNumericSet;    // Numeric
        UnicodeSet           fATermSet;      // ATerm
        UnicodeSet           fSContinueSet;  // SContinue
        UnicodeSet           fSTermSet;      // STerm
        UnicodeSet           fCloseSet;      // Close
        UnicodeSet           fOtherSet;      // Everything not in any of the other sets
        UnicodeSet           fExtendSet;     // Extend
1236
1237
1238
1239         RBBISentenceMonkey() {
1240             fCharProperty  = UProperty.SENTENCE_BREAK;
1241
1242             fSets            = new ArrayList();
1243
1244             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
1245             //                       set and made into character classes of their own.  For the monkey impl,
1246             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
1247             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1248             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1249             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1250             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1251             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1252             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1253             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1254             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1255             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1256             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1257             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1258             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1259             fOtherSet        = new UnicodeSet();
1260
1261
1262             fOtherSet.complement();
1263             fOtherSet.removeAll(fSepSet);
1264             fOtherSet.removeAll(fFormatSet);
1265             fOtherSet.removeAll(fSpSet);
1266             fOtherSet.removeAll(fLowerSet);
1267             fOtherSet.removeAll(fUpperSet);
1268             fOtherSet.removeAll(fOLetterSet);
1269             fOtherSet.removeAll(fNumericSet);
1270             fOtherSet.removeAll(fATermSet);
1271             fOtherSet.removeAll(fSContinueSet);
1272             fOtherSet.removeAll(fSTermSet);
1273             fOtherSet.removeAll(fCloseSet);
1274             fOtherSet.removeAll(fExtendSet);
1275
1276             fSets.add(fSepSet);
1277             fSets.add(fFormatSet);
1278
1279             fSets.add(fSpSet);
1280             fSets.add(fLowerSet);
1281             fSets.add(fUpperSet);
1282             fSets.add(fOLetterSet);
1283             fSets.add(fNumericSet);
1284             fSets.add(fATermSet);
1285             fSets.add(fSContinueSet);
1286             fSets.add(fSTermSet);
1287             fSets.add(fCloseSet);
1288             fSets.add(fOtherSet);
1289             fSets.add(fExtendSet);
1290         }
1291
1292
1293         List  charClasses() {
1294             return fSets;
1295         }
1296
1297         void   setText(StringBuffer s) {
1298             fText = s;
1299         }
1300
1301
1302         //      moveBack()   Find the "significant" code point preceding the index i.
1303         //      Skips over ($Extend | $Format)*
1304         //
1305         private int moveBack(int i) {
1306
1307             if (i <= 0) {
1308                 return -1;
1309             }
1310
1311             int      c;
1312             int      j = i;
1313             do {
1314                 j = moveIndex32(fText, j, -1);
1315                 c = UTF16.charAt(fText, j);
1316             }
1317             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1318             return j;
1319         }
1320
1321
1322         int moveForward(int i) {
1323             if (i>=fText.length()) {
1324                 return fText.length();
1325             }
1326             int   c;
1327             int   j = i;
1328             do {
1329                 j = moveIndex32(fText, j, 1);
1330                 c = cAt(j);
1331             }
1332             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1333             return j;
1334
1335         }
1336
1337         int cAt(int pos) {
1338             if (pos<0 || pos>=fText.length()) {
1339                 return -1;
1340             }
1341             return UTF16.charAt(fText, pos);
1342         }
1343
1344         int   next(int prevPos) {
1345             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
1346                                         //   break position being tested.  The candidate break
1347                                         //   location is before p2.
1348             int     breakPos = -1;
1349
1350             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
1351             int c;
1352
1353             // Prev break at end of string.  return DONE.
1354             if (prevPos >= fText.length()) {
1355                 return -1;
1356             }
1357             /*p0 =*/ p1 = p2 = p3 = prevPos;
1358             c3 = UTF16.charAt(fText, prevPos);
1359             c0 = c1 = c2 = 0;
1360
1361             // Loop runs once per "significant" character position in the input text.
1362             for (;;) {
1363                 // Move all of the positions forward in the input string.
1364                 /*p0 = p1;*/  c0 = c1;
1365                 p1 = p2;  c1 = c2;
1366                 p2 = p3;  c2 = c3;
1367
1368                 // Advancd p3 by  X(Extend | Format)*   Rule 4
1369                 p3 = moveForward(p3);
1370                 c3 = cAt(p3);
1371
1372                 // Rule (3) CR x LF
1373                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1374                     continue;
1375                 }
1376
1377                 // Rule (4)    Sep  <break>
1378                 if (fSepSet.contains(c1)) {
1379                     p2 = p1+1;   // Separators don't combine with Extend or Format
1380                     break;
1381                 }
1382
1383                 if (p2 >= fText.length()) {
1384                     // Reached end of string.  Always a break position.
1385                     break;
1386                 }
1387
1388                 if (p2 == prevPos) {
1389                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1390                     continue;
1391                 }
1392
1393                 // Rule (6).   ATerm x Numeric
1394                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
1395                     continue;
1396                 }
1397
1398                 // Rule (7).  Upper ATerm  x  Uppper
1399                 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1400                     continue;
1401                 }
1402
1403                 // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower
1404                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a
1405                 //                  note to the Unicode 5.0 documents.
1406                 int p8 = p1;
1407                 while (p8>0 && fSpSet.contains(cAt(p8))) {
1408                     p8 = moveBack(p8);
1409                 }
1410                 while (p8>0 && fCloseSet.contains(cAt(p8))) {
1411                     p8 = moveBack(p8);
1412                 }
1413                 if (fATermSet.contains(cAt(p8))) {
1414                     p8=p2;
1415                     for (;;) {
1416                         c = cAt(p8);
1417                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1418                             fLowerSet.contains(c) || fSepSet.contains(c) ||
1419                             fATermSet.contains(c) || fSTermSet.contains(c))
1420                          {
1421                             break;
1422                         }
1423                         p8 = moveForward(p8);
1424                     }
1425                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1426                         continue;
1427                     }
1428                 }
1429
1430                 // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
1431                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1432                     p8 = p1;
1433                     while (setContains(fSpSet, cAt(p8))) {
1434                         p8 = moveBack(p8);
1435                     }
1436                     while (setContains(fCloseSet, cAt(p8))) {
1437                         p8 = moveBack(p8);
1438                     }
1439                     c = cAt(p8);
1440                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1441                         continue;
1442                     }
1443                 }
1444
1445
1446                 // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
1447                 int p9 = p1;
1448                 while (p9>0 && fCloseSet.contains(cAt(p9))) {
1449                     p9 = moveBack(p9);
1450                 }
1451                 c = cAt(p9);
1452                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1453                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1454                         continue;
1455                     }
1456                 }
1457
1458                 // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
1459                 int p10 = p1;
1460                 while (p10>0 && fSpSet.contains(cAt(p10))) {
1461                     p10 = moveBack(p10);
1462                 }
1463                 while (p10>0 && fCloseSet.contains(cAt(p10))) {
1464                     p10 = moveBack(p10);
1465                 }
1466                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1467                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1468                         continue;
1469                     }
1470                 }
1471
1472                 // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
1473                 int p11 = p1;
1474                 if (p11>0 && fSepSet.contains(cAt(p11))) {
1475                     p11 = moveBack(p11);
1476                 }
1477                 while (p11>0 && fSpSet.contains(cAt(p11))) {
1478                     p11 = moveBack(p11);
1479                 }
1480                 while (p11>0 && fCloseSet.contains(cAt(p11))) {
1481                     p11 = moveBack(p11);
1482                 }
1483                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1484                     break;
1485                 }
1486
1487                 //  Rule (12)  Any x Any
1488                 continue;
1489             }
1490             breakPos = p2;
1491             return breakPos;
1492         }
1493
1494
1495
1496     }
1497
1498
1499     /**
1500      * Move an index into a string by n code points.
1501      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1502      *   complicating usage.
1503      * @param s   a Text string
1504      * @param pos The starting code unit index into the text string
1505      * @param amt The amount to adjust the string by.
1506      * @return    The adjusted code unit index, pinned to the string's length, or
1507      *            unchanged if input index was outside of the string.
1508      */
1509     static int moveIndex32(StringBuffer s, int pos, int amt) {
1510         int i;
1511         char  c;
1512         if (amt>0) {
1513             for (i=0; i<amt; i++) {
1514                 if (pos >= s.length()) {
1515                     return s.length();
1516                 }
1517                 c = s.charAt(pos);
1518                 pos++;
1519                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1520                     c = s.charAt(pos);
1521                     if (UTF16.isTrailSurrogate(c)) {
1522                         pos++;
1523                     }
1524                 }
1525             }
1526         } else {
1527             for (i=0; i>amt; i--) {
1528                 if (pos <= 0) {
1529                     return 0;
1530                 }
1531                 pos--;
1532                 c = s.charAt(pos);
1533                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1534                     c = s.charAt(pos);
1535                     if (UTF16.isLeadSurrogate(c)) {
1536                         pos--;
1537                     }
1538                 }
1539             }
1540         }
1541         return pos;
1542     }
1543
1544     /**
1545      * No-exceptions form of UnicodeSet.contains(c).
1546      *    Simplifies loops that terminate with an end-of-input character value.
1547      * @param s  A unicode set
1548      * @param c  A code point value
1549      * @return   true if the set contains c.
1550      */
1551     static boolean setContains(UnicodeSet s, int c) {
1552         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1553             return false;
1554         }
1555         return s.contains(c);
1556     }
1557
1558
1559     /**
1560      * return the index of the next code point in the input text.
1561      * @param i the preceding index
1562      */
1563     static int  nextCP(StringBuffer s, int i) {
1564         if (i == -1) {
1565             // End of Input indication.  Continue to return end value.
1566             return -1;
1567         }
1568         int  retVal = i + 1;
1569         if (retVal > s.length()) {
1570             return -1;
1571         }
1572         int  c = UTF16.charAt(s, i);
1573         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1574             retVal++;
1575         }
1576         return retVal;
1577     }
1578
1579
1580     /**
1581      * random number generator.  Not using Java's built-in Randoms for two reasons:
1582      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1583      *    2.  We need to get and restore the seed from values occurring in the middle
1584      *        of a long sequence, to more easily reproduce failing cases.
1585      */
1586     private static int m_seed = 1;
1587     private static int  m_rand()
1588     {
1589         m_seed = m_seed * 1103515245 + 12345;
1590         return (int)(m_seed >>> 16) % 32768;
1591     }
1592
1593     // Helper function for formatting error output.
1594     //   Append a string into a fixed-size field in a StringBuffer.
1595     //   Blank-pad the string if it is shorter than the field.
1596     //   Truncate the source string if it is too long.
1597     //
1598     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1599         int appendLen = src.length();
1600         if (appendLen >= fieldLen) {
1601             dest.append(src.substring(0, fieldLen));
1602         } else {
1603             dest.append(src);
1604             while (appendLen < fieldLen) {
1605                 dest.append(' ');
1606                 appendLen++;
1607             }
1608         }
1609     }
1610
1611     // Helper function for formatting error output.
1612     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1613     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1614            String hexChars = "0123456789abcdef";
1615            if (c < 0x10000) {
1616                 dest.append("\\u");
1617                 for (int bn=12; bn>=0; bn-=4) {
1618                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
1619                 }
1620                 appendToBuf(dest, " ", fieldLen-6);
1621             } else {
1622                 dest.append("\\U");
1623                 for (int bn=28; bn>=0; bn-=4) {
1624                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
1625                 }
1626                 appendToBuf(dest, " ", fieldLen-10);
1627
1628             }
1629        }
1630
1631 /**
1632  *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1633  *    Parameters:
1634  *       bi      - the break iterator to use
1635  *       mk      - MonkeyKind, abstraction for obtaining expected results
1636  *       name    - Name of test (char, word, etc.) for use in error messages
1637  *       seed    - Seed for starting random number generator (parameter from user)
1638  *       numIterations
1639  */
1640 void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
1641     int              TESTSTRINGLEN = 500;
1642     StringBuffer     testText         = new StringBuffer();
1643     int              numCharClasses;
1644     List             chClasses;
1645     int[]            expected         = new int[TESTSTRINGLEN*2 + 1];
1646     int              expectedCount    = 0;
1647     boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
1648     boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1649     boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1650     boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1651     boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1652     boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1653     int              i;
1654     int              loopCount        = 0;
1655     boolean          printTestData    = false;
1656     boolean          printBreaksFromBI = false;
1657
1658     m_seed = seed;
1659
1660     numCharClasses = mk.charClasses().size();
1661     chClasses      = mk.charClasses();
1662
1663     // Verify that the character classes all have at least one member.
1664     for (i=0; i<numCharClasses; i++) {
1665         UnicodeSet s = (UnicodeSet)chClasses.get(i);
1666         if (s == null || s.size() == 0) {
1667             errln("Character Class " + i + " is null or of zero size.");
1668             return;
1669         }
1670     }
1671
1672     //--------------------------------------------------------------------------------------------
1673     //
1674     //  Debugging settings.  Comment out everything in the following block for normal operation
1675     //
1676     //--------------------------------------------------------------------------------------------
1677     // numIterations = -1;
1678     // RuleBasedBreakIterator_New.fTrace = true;
1679     // m_seed = 859056465;
1680     // TESTSTRINGLEN = 50;
1681     // printTestData = true;
1682     // printBreaksFromBI = true;
1683     // ((RuleBasedBreakIterator_New)bi).dump();
1684
1685     //--------------------------------------------------------------------------------------------
1686     //
1687     //  End of Debugging settings.
1688     //
1689     //--------------------------------------------------------------------------------------------
1690
1691     int  dotsOnLine = 0;
1692      while (loopCount < numIterations || numIterations == -1) {
1693         if (numIterations == -1 && loopCount % 10 == 0) {
1694             // If test is running in an infinite loop, display a periodic tic so
1695             //   we can tell that it is making progress.
1696             System.out.print(".");
1697             if (dotsOnLine++ >= 80){
1698                 System.out.println();
1699                 dotsOnLine = 0;
1700             }
1701         }
1702         // Save current random number seed, so that we can recreate the random numbers
1703         //   for this loop iteration in event of an error.
1704         seed = m_seed;
1705
1706         testText.setLength(0);
1707         // Populate a test string with data.
1708         if (printTestData) {
1709             System.out.println("Test Data string ...");
1710         }
1711         for (i=0; i<TESTSTRINGLEN; i++) {
1712             int        aClassNum = m_rand() % numCharClasses;
1713             UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
1714             int        charIdx   = m_rand() % classSet.size();
1715             int        c         = classSet.charAt(charIdx);
1716             if (c < 0) {   // TODO:  deal with sets containing strings.
1717                 errln("c < 0");
1718             }
1719             UTF16.appendCodePoint(testText, c);
1720             if (printTestData) {
1721                 System.out.print(Integer.toHexString(c) + " ");
1722             }
1723         }
1724         if (printTestData) {
1725             System.out.println();
1726         }
1727
1728         Arrays.fill(expected, 0);
1729         Arrays.fill(expectedBreaks, false);
1730         Arrays.fill(forwardBreaks, false);
1731         Arrays.fill(reverseBreaks, false);
1732         Arrays.fill(isBoundaryBreaks, false);
1733         Arrays.fill(followingBreaks, false);
1734         Arrays.fill(precedingBreaks, false);
1735
1736         // Calculate the expected results for this test string.
1737         mk.setText(testText);
1738         expectedCount = 0;
1739         expectedBreaks[0] = true;
1740         expected[expectedCount ++] = 0;
1741         int breakPos = 0;
1742         int lastBreakPos = -1;
1743         for (;;) {
1744             lastBreakPos = breakPos;
1745             breakPos = mk.next(breakPos);
1746             if (breakPos == -1) {
1747                 break;
1748             }
1749             if (breakPos > testText.length()) {
1750                 errln("breakPos > testText.length()");
1751             }
1752             if (lastBreakPos >= breakPos) {
1753                 errln("Next() not increasing.");
1754                 // break;
1755             }
1756             expectedBreaks[breakPos] = true;
1757             expected[expectedCount ++] = breakPos;
1758         }
1759
1760         // Find the break positions using forward iteration
1761         if (printBreaksFromBI) {
1762             System.out.println("Breaks from BI...");
1763         }
1764         bi.setText(testText.toString());
1765         for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
1766             if (i < 0 || i > testText.length()) {
1767                 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
1768                 break;
1769             }
1770             if (printBreaksFromBI) {
1771                 System.out.print(Integer.toHexString(i) + " ");
1772             }
1773             forwardBreaks[i] = true;
1774         }
1775         if (printBreaksFromBI) {
1776             System.out.println();
1777         }
1778
1779         // Find the break positions using reverse iteration
1780         for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
1781             if (i < 0 || i > testText.length()) {
1782                 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
1783                 break;
1784             }
1785             reverseBreaks[i] = true;
1786         }
1787
1788         // Find the break positions using isBoundary() tests.
1789         for (i=0; i<=testText.length(); i++) {
1790             isBoundaryBreaks[i] = bi.isBoundary(i);
1791         }
1792
1793         // Find the break positions using the following() function.
1794         lastBreakPos = 0;
1795         followingBreaks[0] = true;
1796         for (i=0; i<testText.length(); i++) {
1797             breakPos = bi.following(i);
1798             if (breakPos <= i ||
1799                 breakPos < lastBreakPos ||
1800                 breakPos > testText.length() ||
1801                 breakPos > lastBreakPos && lastBreakPos > i ) {
1802                 errln(name + " break monkey test: " +
1803                     "Out of range value returned by BreakIterator::following().\n" +
1804                     "index=" + i + "following returned=" + breakPos +
1805                     "lastBreak=" + lastBreakPos);
1806                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
1807             } else {
1808                 followingBreaks[breakPos] = true;
1809                 lastBreakPos = breakPos;
1810             }
1811         }
1812
1813         // Find the break positions using the preceding() function.
1814         lastBreakPos = testText.length();
1815         precedingBreaks[testText.length()] = true;
1816         for (i=testText.length(); i>0; i--) {
1817             breakPos = bi.preceding(i);
1818             if (breakPos >= i ||
1819                 breakPos > lastBreakPos ||
1820                 breakPos < 0 ||
1821                 breakPos < lastBreakPos && lastBreakPos < i ) {
1822                 errln(name + " break monkey test: " +
1823                         "Out of range value returned by BreakIterator::preceding().\n" +
1824                         "index=" + i + "preceding returned=" + breakPos +
1825                         "lastBreak=" + lastBreakPos);
1826                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
1827             } else {
1828                 precedingBreaks[breakPos] = true;
1829                 lastBreakPos = breakPos;
1830             }
1831         }
1832
1833
1834
1835         // Compare the expected and actual results.
1836         for (i=0; i<=testText.length(); i++) {
1837             String errorType = null;
1838             if  (forwardBreaks[i] != expectedBreaks[i]) {
1839                 errorType = "next()";
1840             } else if (reverseBreaks[i] != forwardBreaks[i]) {
1841                 errorType = "previous()";
1842             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
1843                 errorType = "isBoundary()";
1844             } else if (followingBreaks[i] != expectedBreaks[i]) {
1845                 errorType = "following()";
1846             } else if (precedingBreaks[i] != expectedBreaks[i]) {
1847                 errorType = "preceding()";
1848             }
1849
1850
1851             if (errorType != null) {
1852                 // Format a range of the test text that includes the failure as
1853                 //  a data item that can be included in the rbbi test data file.
1854
1855                 // Start of the range is the last point where expected and actual results
1856                 //   both agreed that there was a break position.
1857                 int startContext = i;
1858                 int count = 0;
1859                 for (;;) {
1860                     if (startContext==0) { break; }
1861                     startContext --;
1862                     if (expectedBreaks[startContext]) {
1863                         if (count == 2) break;
1864                         count ++;
1865                     }
1866                 }
1867
1868                 // End of range is two expected breaks past the start position.
1869                 int endContext = i + 1;
1870                 int ci;
1871                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
1872                     for (;;) {
1873                         if (endContext >= testText.length()) {break;}
1874                         if (expectedBreaks[endContext-1]) {
1875                             if (count == 0) break;
1876                             count --;
1877                         }
1878                         endContext ++;
1879                     }
1880                 }
1881
1882                 // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
1883                 StringBuffer errorText = new StringBuffer();
1884
1885                 int      c;    // Char from test data
1886                 for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
1887                     if (ci == i) {
1888                         // This is the location of the error.
1889                         errorText.append("<?>---------------------------------\n");
1890                     } else if (expectedBreaks[ci]) {
1891                         // This a non-error expected break position.
1892                         errorText.append("------------------------------------\n");
1893                     }
1894                     if (ci < testText.length()) {
1895                         c = UTF16.charAt(testText, ci);
1896                         appendCharToBuf(errorText, c, 11);
1897                         String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
1898                         appendToBuf(errorText, gc, 8);
1899                         int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
1900                         String extraPropValue =
1901                             UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
1902                         appendToBuf(errorText, extraPropValue, 20);
1903
1904                         String charName = UCharacter.getExtendedName(c);
1905                         appendToBuf(errorText, charName, 40);
1906                         errorText.append('\n');
1907                     }
1908                 }
1909                 if (ci == testText.length() && ci != -1) {
1910                     errorText.append("<>");
1911                 }
1912                 errorText.append("</data>\n");
1913
1914                 // Output the error
1915                 errln(name + " break monkey test error.  " +
1916                      (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
1917                       "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
1918                       errorText);
1919                 break;
1920             }
1921         }
1922
1923         loopCount++;
1924     }
1925 }
1926
1927 public void TestCharMonkey() {
1928
1929     int        loopCount = 500;
1930     int        seed      = 1;
1931
1932     if (params.inclusion >= 9) {
1933         loopCount = 10000;
1934     }
1935
1936     RBBICharMonkey  m = new RBBICharMonkey();
1937     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
1938     RunMonkey(bi, m, "char", seed, loopCount);
1939 }
1940
1941 public void TestWordMonkey() {
1942
1943     int        loopCount = 500;
1944     int        seed      = 1;
1945
1946     if (params.inclusion >= 9) {
1947         loopCount = 10000;
1948     }
1949
1950     logln("Word Break Monkey Test");
1951     RBBIWordMonkey  m = new RBBIWordMonkey();
1952     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
1953     RunMonkey(bi, m, "word", seed, loopCount);
1954 }
1955
1956 public void TestLineMonkey() {
1957     int        loopCount = 500;
1958     int        seed      = 1;
1959
1960     if (params.inclusion >= 9) {
1961         loopCount = 10000;
1962     }
1963
1964     logln("Line Break Monkey Test");
1965     RBBILineMonkey  m = new RBBILineMonkey();
1966     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
1967     if (params == null) {
1968         loopCount = 50;
1969     }
1970     RunMonkey(bi, m, "line", seed, loopCount);
1971 }
1972
1973 public void TestSentMonkey() {
1974
1975     int        loopCount = 500;
1976     int        seed      = 1;
1977
1978     if (params.inclusion >= 9) {
1979         loopCount = 3000;
1980     }
1981
1982     logln("Sentence Break Monkey Test");
1983     RBBISentenceMonkey  m = new RBBISentenceMonkey();
1984     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
1985     if (params == null) {
1986         loopCount = 30;
1987     }
1988     RunMonkey(bi, m, "sent", seed, loopCount);
1989 }
1990 //
1991 //  Round-trip monkey tests.
1992 //  Verify that break iterators created from the rule source from the default
1993 //    break iterators still pass the monkey test for the iterator type.
1994 //
1995 //  This is a major test for the Rule Compiler.  The default break iterators are built
1996 //  from pre-compiled binary rule data that was created using ICU4C; these
1997 //  round-trip rule recompile tests verify that the Java rule compiler can
1998 //  rebuild break iterators from the original source rules.
1999 //
2000 public void TestRTCharMonkey() {
2001
2002     int        loopCount = 200;
2003     int        seed      = 1;
2004
2005     if (params.inclusion >= 9) {
2006         loopCount = 2000;
2007     }
2008
2009     RBBICharMonkey  m = new RBBICharMonkey();
2010     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2011     String rules = bi.toString();
2012     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2013     RunMonkey(rtbi, m, "char", seed, loopCount);
2014 }
2015
2016 public void TestRTWordMonkey() {
2017
2018     int        loopCount = 200;
2019     int        seed      = 1;
2020
2021     if (params.inclusion >= 9) {
2022         loopCount = 2000;
2023     }
2024     logln("Word Break Monkey Test");
2025     RBBIWordMonkey  m = new RBBIWordMonkey();
2026     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2027     String rules = bi.toString();
2028     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2029     RunMonkey(rtbi, m, "word", seed, loopCount);
2030 }
2031
2032 public void TestRTLineMonkey() {
2033     int        loopCount = 200;
2034     int        seed      = 1;
2035
2036     if (params.inclusion >= 9) {
2037         loopCount = 2000;
2038     }
2039
2040     logln("Line Break Monkey Test");
2041     RBBILineMonkey  m = new RBBILineMonkey();
2042     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2043     String rules = bi.toString();
2044     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2045     if (params == null) {
2046         loopCount = 50;
2047     }
2048     RunMonkey(rtbi, m, "line", seed, loopCount);
2049 }
2050
2051 public void TestRTSentMonkey() {
2052
2053     int        loopCount = 200;
2054     int        seed      = 1;
2055
2056     if (params.inclusion >= 9) {
2057         loopCount = 1000;
2058     }
2059
2060     logln("Sentence Break Monkey Test");
2061     RBBISentenceMonkey  m = new RBBISentenceMonkey();
2062     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2063     String rules = bi.toString();
2064     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2065     if (params == null) {
2066         loopCount = 30;
2067     }
2068     RunMonkey(rtbi, m, "sent", seed, loopCount);
2069 }
2070
2071
2072
2073 }
2074