jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2003-2008 International Business Machines Corporation and     *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7  package com.ibm.icu.dev.test.rbbi;\r
   8 \r
   9 \r
  10 // Monkey testing of RuleBasedBreakIterator\r
  11 import java.util.ArrayList;\r
  12 import java.util.Arrays;\r
  13 import java.util.List;\r
  14 import java.util.Locale;\r
  15 \r
  16 import com.ibm.icu.dev.test.TestFmwk;\r
  17 import com.ibm.icu.lang.UCharacter;\r
  18 import com.ibm.icu.lang.UProperty;\r
  19 import com.ibm.icu.text.BreakIterator;\r
  20 import com.ibm.icu.text.RuleBasedBreakIterator;\r
  21 import com.ibm.icu.text.UTF16;\r
  22 import com.ibm.icu.text.UnicodeSet;\r
  23 \r
  24 \r
  25 /**\r
  26  * Monkey tests for RBBI.  These tests have independent implementations of\r
  27  * the Unicode TR boundary rules, and compare results between these and ICU's\r
  28  * implementation, using random data.\r
  29  * \r
  30  * Tests cover Grapheme Cluster (char), Word and Line breaks\r
  31  * \r
  32  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp\r
  33  *\r
  34  */\r
  35 public class RBBITestMonkey extends TestFmwk {\r
  36     \r
  37     public static void main(String[] args) {\r
  38         new RBBITestMonkey().run(args);\r
  39     }\r
  40     \r
//
//     class RBBIMonkeyKind
//
//        Monkey Test for Break Iteration
//        Abstract interface class.   Concrete derived classes independently
//        implement the break rules for different iterator types.
//
//        The Monkey Test itself doesn't know which type of break iterator it is
//        testing, but works purely in terms of the interface defined here.
//
  51     abstract static class RBBIMonkeyKind {\r
  52     \r
  53         // Return a List of UnicodeSets, representing the character classes used\r
  54         //   for this type of iterator.\r
  55         abstract  List  charClasses();\r
  56 \r
  57         // Set the test text on which subsequent calls to next() will operate\r
  58         abstract  void   setText(StringBuffer text);\r
  59 \r
  60         // Find the next break postion, starting from the specified position.\r
  61         // Return -1 after reaching end of string.\r
  62         abstract   int   next(int i);\r
  63         \r
  64         // A Character Property, one of the constants defined in class UProperty.\r
  65         //   The value fo this property will be displayed for the characters\r
  66         //    near any test failure.  \r
  67         int   fCharProperty;\r
  68     }\r
  69 \r
  70  \r
  71     /**\r
  72      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.\r
  73      */\r
  74     static class RBBICharMonkey extends RBBIMonkeyKind {\r
  75         List                      fSets;\r
  76 \r
  77         UnicodeSet                fCRLFSet;\r
  78         UnicodeSet                fControlSet;\r
  79         UnicodeSet                fExtendSet;\r
  80         UnicodeSet                fPrependSet;\r
  81         UnicodeSet                fSpacingSet;\r
  82         UnicodeSet                fLSet;\r
  83         UnicodeSet                fVSet;\r
  84         UnicodeSet                fTSet;\r
  85         UnicodeSet                fLVSet;\r
  86         UnicodeSet                fLVTSet;\r
  87         UnicodeSet                fHangulSet;\r
  88         UnicodeSet                fAnySet;\r
  89 \r
  90         StringBuffer              fText;\r
  91 \r
  92 \r
  93     RBBICharMonkey() {\r
  94         fText       = null;\r
  95         fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;\r
  96         fCRLFSet    = new UnicodeSet("[\\r\\n]");\r
  97         fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");\r
  98         fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");\r
  99         fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");\r
 100         fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");\r
 101         fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");\r
 102         fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");\r
 103         fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");\r
 104         fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");\r
 105         fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");\r
 106         fHangulSet  = new UnicodeSet();\r
 107         fHangulSet.addAll(fLSet);\r
 108         fHangulSet.addAll(fVSet);\r
 109         fHangulSet.addAll(fTSet);\r
 110         fHangulSet.addAll(fLVSet);\r
 111         fHangulSet.addAll(fLVTSet);\r
 112 \r
 113         fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]");\r
 114 \r
 115         fSets       = new ArrayList();\r
 116         fSets.add(fCRLFSet);\r
 117         fSets.add(fControlSet);\r
 118         fSets.add(fExtendSet);\r
 119         fSets.add(fPrependSet);\r
 120         fSets.add(fSpacingSet);\r
 121         fSets.add(fHangulSet);\r
 122         fSets.add(fAnySet);\r
 123      }\r
 124 \r
 125 \r
 126     void setText(StringBuffer s) {\r
 127         fText = s;\r
 128     }\r
 129     \r
 130     List charClasses() {\r
 131         return fSets;\r
 132     }\r
 133     \r
 134     int next(int prevPos) {\r
 135         int    p1, p2, p3;    // Indices of the significant code points around the\r
 136                               //   break position being tested.  The candidate break\r
 137                               //   location is before p2.\r
 138     \r
 139         int     breakPos = -1;\r
 140     \r
 141         int   c1, c2, c3;     // The code points at p0, p1, p2 & p3.\r
 142         \r
 143         // Previous break at end of string.  return DONE.\r
 144         if (prevPos >= fText.length()) {\r
 145             return -1;\r
 146         }\r
 147         p1 = p2 = p3 = prevPos;\r
 148         c3 =  UTF16.charAt(fText, prevPos);\r
 149         c1 = c2 = 0;\r
 150     \r
 151         // Loop runs once per "significant" character position in the input text.\r
 152         for (;;) {\r
 153             // Move all of the positions forward in the input string.\r
 154             p1 = p2;  c1 = c2;\r
 155             p2 = p3;  c2 = c3;\r
 156     \r
 157             // Advance p3 by one codepoint\r
 158             p3 = moveIndex32(fText, p3, 1);\r
 159             c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);\r
 160     \r
 161             if (p1 == p2) {\r
 162                 // Still warming up the loop.  (won't work with zero length strings, but we don't care)\r
 163                 continue;\r
 164             }\r
 165             if (p2 == fText.length()) {\r
 166                 // Reached end of string.  Always a break position.\r
 167                 break;\r
 168             }\r
 169     \r
 170             // Rule  GB3   CR x LF\r
 171             //     No Extend or Format characters may appear between the CR and LF,\r
 172             //     which requires the additional check for p2 immediately following p1.\r
 173             //\r
 174             if (c1==0x0D && c2==0x0A && p1==(p2-1)) {\r
 175                 continue;\r
 176             }\r
 177     \r
 178             // Rule (GB4).   ( Control | CR | LF ) <break>\r
 179             if (fControlSet.contains(c1) ||\r
 180                 c1 == 0x0D ||\r
 181                 c1 == 0x0A)  {\r
 182                 break;\r
 183             }\r
 184     \r
 185             // Rule (GB5)    <break>  ( Control | CR | LF )\r
 186             //\r
 187             if (fControlSet.contains(c2) ||\r
 188                 c2 == 0x0D ||\r
 189                 c2 == 0x0A)  {\r
 190                 break;\r
 191             }\r
 192     \r
 193     \r
 194             // Rule (GB6)  L x ( L | V | LV | LVT )\r
 195             if (fLSet.contains(c1) &&\r
 196                 (fLSet.contains(c2)  ||\r
 197                     fVSet.contains(c2)  ||\r
 198                     fLVSet.contains(c2) ||\r
 199                     fLVTSet.contains(c2))) {\r
 200                 continue;\r
 201             }\r
 202     \r
 203             // Rule (GB7)    ( LV | V )  x  ( V | T )\r
 204             if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&\r
 205                 (fVSet.contains(c2) || fTSet.contains(c2)))  {\r
 206                 continue;\r
 207             }\r
 208     \r
 209             // Rule (GB8)    ( LVT | T)  x T\r
 210             if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&\r
 211                 fTSet.contains(c2))  {\r
 212                 continue;\r
 213             }\r
 214     \r
 215             // Rule (GB9)    Numeric x ALetter\r
 216             if (fExtendSet.contains(c2))  {\r
 217                 continue;\r
 218             }\r
 219     \r
 220             // Rule (GB9a)   x  SpacingMark\r
 221             if (fSpacingSet.contains(c2)) {\r
 222                 continue;\r
 223             }\r
 224     \r
 225             // Rule (GB9b)   Prepend x\r
 226             if (fPrependSet.contains(c1)) {\r
 227                 continue;\r
 228             }\r
 229     \r
 230             // Rule (GB10)  Any  <break>  Any\r
 231             break;\r
 232         }\r
 233     \r
 234         breakPos = p2;\r
 235         return breakPos;\r
 236         }\r
 237     }\r
 238 \r
 239 \r
 240     /**\r
 241      * \r
 242      * Word Monkey Test Class\r
 243      *\r
 244      * \r
 245      * \r
 246      */\r
 247     static class RBBIWordMonkey extends RBBIMonkeyKind {\r
 248         List                      fSets;\r
 249         StringBuffer              fText;\r
 250 \r
 251         UnicodeSet                fCRSet;\r
 252         UnicodeSet                fLFSet;\r
 253         UnicodeSet                fNewlineSet;\r
 254         UnicodeSet                fKatakanaSet;\r
 255         UnicodeSet                fALetterSet;\r
 256         UnicodeSet                fMidNumLetSet;\r
 257         UnicodeSet                fMidLetterSet;\r
 258         UnicodeSet                fMidNumSet;\r
 259         UnicodeSet                fNumericSet;\r
 260         UnicodeSet                fFormatSet;\r
 261         UnicodeSet                fExtendSet;\r
 262         UnicodeSet                fExtendNumLetSet;\r
 263         UnicodeSet                fOtherSet;\r
 264 \r
 265         \r
 266         RBBIWordMonkey() {\r
 267             fCharProperty    = UProperty.WORD_BREAK;\r
 268 \r
 269             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");\r
 270             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");\r
 271             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");\r
 272             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");\r
 273             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");\r
 274             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");\r
 275             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");\r
 276             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");\r
 277             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");\r
 278             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");\r
 279             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");\r
 280             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");\r
 281 \r
 282             fOtherSet        = new UnicodeSet();\r
 283             fOtherSet.complement();\r
 284             fOtherSet.removeAll(fCRSet);\r
 285             fOtherSet.removeAll(fLFSet);\r
 286             fOtherSet.removeAll(fNewlineSet);\r
 287             fOtherSet.removeAll(fALetterSet);\r
 288             fOtherSet.removeAll(fKatakanaSet);\r
 289             fOtherSet.removeAll(fMidLetterSet);\r
 290             fOtherSet.removeAll(fMidNumSet);\r
 291             fOtherSet.removeAll(fNumericSet);\r
 292             fOtherSet.removeAll(fFormatSet);\r
 293             fOtherSet.removeAll(fExtendSet);\r
 294             fOtherSet.removeAll(fExtendNumLetSet);\r
 295             // Inhibit dictionary characters from being tested at all.\r
 296             fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));\r
 297 \r
 298             fSets            = new ArrayList();\r
 299             fSets.add(fCRSet);\r
 300             fSets.add(fLFSet);\r
 301             fSets.add(fNewlineSet);\r
 302             fSets.add(fALetterSet);\r
 303             fSets.add(fKatakanaSet);\r
 304             fSets.add(fMidLetterSet);\r
 305             fSets.add(fMidNumLetSet);\r
 306             fSets.add(fMidNumSet);\r
 307             fSets.add(fNumericSet);\r
 308             fSets.add(fFormatSet);\r
 309             fSets.add(fExtendSet);\r
 310             fSets.add(fExtendNumLetSet);\r
 311             fSets.add(fOtherSet);\r
 312         }\r
 313         \r
 314         \r
 315         List  charClasses() {\r
 316          return fSets;  \r
 317         }\r
 318         \r
 319         void   setText(StringBuffer s) { \r
 320             fText = s;        \r
 321         }   \r
 322 \r
 323         int   next(int prevPos) {  \r
 324             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the \r
 325                                         //   break position being tested.  The candidate break\r
 326                                         //   location is before p2.\r
 327             int     breakPos = -1;\r
 328             \r
 329             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.\r
 330             \r
 331             // Previous break at end of string.  return DONE.\r
 332             if (prevPos >= fText.length()) {\r
 333                 return -1;\r
 334             }\r
 335             /*p0 =*/ p1 = p2 = p3 = prevPos;\r
 336             c3 = UTF16.charAt(fText, prevPos);\r
 337             c0 = c1 = c2 = 0;\r
 338             \r
 339             \r
 340 \r
 341             // Loop runs once per "significant" character position in the input text.\r
 342             for (;;) {\r
 343                 // Move all of the positions forward in the input string.\r
 344                 /*p0 = p1;*/  c0 = c1;\r
 345                 p1 = p2;  c1 = c2;\r
 346                 p2 = p3;  c2 = c3;\r
 347                 \r
 348                 // Advancd p3 by    X(Extend | Format)*   Rule 4\r
 349                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)\r
 350                 do {\r
 351                     p3 = moveIndex32(fText, p3, 1);\r
 352                     c3 = -1;\r
 353                     if (p3>=fText.length()) {\r
 354                         break;\r
 355                     }\r
 356                     c3 = UTF16.charAt(fText, p3);\r
 357                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {\r
 358                         break;\r
 359                     }\r
 360                 }\r
 361                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));\r
 362 \r
 363                 if (p1 == p2) {\r
 364                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)\r
 365                     continue;\r
 366                 }\r
 367                 if (p2 == fText.length()) {\r
 368                     // Reached end of string.  Always a break position.\r
 369                     break;\r
 370                 }\r
 371 \r
 372                 // Rule (3)   CR x LF\r
 373                 //     No Extend or Format characters may appear between the CR and LF,\r
 374                 //     which requires the additional check for p2 immediately following p1.\r
 375                 //\r
 376                 if (c1==0x0D && c2==0x0A) {\r
 377                     continue;\r
 378                 }\r
 379                 \r
 380                 // Rule (3a)  Break before and after newlines (including CR and LF)\r
 381                 //\r
 382                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {\r
 383                     break;\r
 384                 }\r
 385                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {\r
 386                     break;\r
 387                 }\r
 388 \r
 389                 // Rule (5).   ALetter x ALetter\r
 390                 if (fALetterSet.contains(c1) &&\r
 391                         fALetterSet.contains(c2))  {\r
 392                     continue;\r
 393                 }\r
 394                 \r
 395                 // Rule (6)  ALetter  x  (MidLetter | MidNumLet)  ALetter\r
 396                 //\r
 397                 if ( fALetterSet.contains(c1) &&\r
 398                         (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&\r
 399                         setContains(fALetterSet, c3)) {\r
 400                     continue;\r
 401                 }\r
 402                 \r
 403                 \r
 404                 // Rule (7)  ALetter (MidLetter | MidNumLet)   x  ALetter\r
 405                 if (fALetterSet.contains(c0) &&\r
 406                         (fMidLetterSet.contains(c1) ||  fMidNumLetSet.contains(c1))  &&\r
 407                         fALetterSet.contains(c2)) {\r
 408                     continue;\r
 409                 }\r
 410                 \r
 411                 //  Rule (8)    Numeric x Numeric\r
 412                 if (fNumericSet.contains(c1) &&\r
 413                         fNumericSet.contains(c2))  {\r
 414                     continue;\r
 415                 }\r
 416                 \r
 417                 // Rule (9)    ALetter x Numeric\r
 418                 if (fALetterSet.contains(c1) &&\r
 419                         fNumericSet.contains(c2))  {\r
 420                     continue;\r
 421                 }\r
 422 \r
 423                 // Rule (10)    Numeric x ALetter\r
 424                 if (fNumericSet.contains(c1) &&\r
 425                         fALetterSet.contains(c2))  {\r
 426                     continue;\r
 427                 }\r
 428                 \r
 429                 // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric\r
 430                 if ( fNumericSet.contains(c0) &&\r
 431                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1))  && \r
 432                         fNumericSet.contains(c2)) {\r
 433                     continue;\r
 434                 }\r
 435                 \r
 436                 // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric\r
 437                 if (fNumericSet.contains(c1) &&\r
 438                         (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&\r
 439                         setContains(fNumericSet, c3)) {\r
 440                     continue;\r
 441                 }\r
 442                 \r
 443                 // Rule (13)  Katakana x Katakana\r
 444                 if (fKatakanaSet.contains(c1) &&\r
 445                         fKatakanaSet.contains(c2))  {\r
 446                     continue;\r
 447                 }\r
 448                 \r
 449                 // Rule 13a  (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet\r
 450                 if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||\r
 451                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&\r
 452                         fExtendNumLetSet.contains(c2)) {\r
 453                     continue;\r
 454                 }\r
 455                 // Rule 13b   ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)\r
 456                 if (fExtendNumLetSet.contains(c1) &&\r
 457                         (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||\r
 458                         fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {\r
 459                     continue;\r
 460                 }\r
 461                \r
 462                 // Rule 14.  Break found here.\r
 463                 break;\r
 464             }\r
 465             \r
 466             breakPos = p2;\r
 467             return breakPos;\r
 468         }\r
 469         \r
 470     }\r
 471 \r
 472  \r
 473     static class RBBILineMonkey extends RBBIMonkeyKind {\r
 474         \r
 475         List        fSets;\r
 476         \r
 477         UnicodeSet  fBK;\r
 478         UnicodeSet  fCR;\r
 479         UnicodeSet  fLF;\r
 480         UnicodeSet  fCM;\r
 481         UnicodeSet  fNL;\r
 482         UnicodeSet  fSG;\r
 483         UnicodeSet  fWJ;\r
 484         UnicodeSet  fZW;\r
 485         UnicodeSet  fGL;\r
 486         UnicodeSet  fCB;\r
 487         UnicodeSet  fSP;\r
 488         UnicodeSet  fB2;\r
 489         UnicodeSet  fBA;\r
 490         UnicodeSet  fBB;\r
 491         UnicodeSet  fHY;\r
 492         UnicodeSet  fCL;\r
 493         UnicodeSet  fEX;\r
 494         UnicodeSet  fIN;\r
 495         UnicodeSet  fNS;\r
 496         UnicodeSet  fOP;\r
 497         UnicodeSet  fQU;\r
 498         UnicodeSet  fIS;\r
 499         UnicodeSet  fNU;\r
 500         UnicodeSet  fPO;\r
 501         UnicodeSet  fPR;\r
 502         UnicodeSet  fSY;\r
 503         UnicodeSet  fAI;\r
 504         UnicodeSet  fAL;\r
 505         UnicodeSet  fID;\r
 506         UnicodeSet  fSA;\r
 507         UnicodeSet  fJL;\r
 508         UnicodeSet  fJV;\r
 509         UnicodeSet  fJT;\r
 510         UnicodeSet  fH2;\r
 511         UnicodeSet  fH3;\r
 512         UnicodeSet  fXX;\r
 513         \r
 514         StringBuffer  fText;\r
 515         int           fOrigPositions;\r
 516         \r
 517         \r
 518         \r
 519         RBBILineMonkey()\r
 520         {\r
 521             fCharProperty  = UProperty.LINE_BREAK;\r
 522             fSets          = new ArrayList();\r
 523             \r
 524             fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");\r
 525             fCR    = new UnicodeSet("[\\p{Line_break=CR}]");\r
 526             fLF    = new UnicodeSet("[\\p{Line_break=LF}]");\r
 527             fCM    = new UnicodeSet("[\\p{Line_break=CM}]");\r
 528             fNL    = new UnicodeSet("[\\p{Line_break=NL}]");\r
 529             fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");\r
 530             fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");\r
 531             fGL    = new UnicodeSet("[\\p{Line_break=GL}]");\r
 532             fCB    = new UnicodeSet("[\\p{Line_break=CB}]");\r
 533             fSP    = new UnicodeSet("[\\p{Line_break=SP}]");\r
 534             fB2    = new UnicodeSet("[\\p{Line_break=B2}]");\r
 535             fBA    = new UnicodeSet("[\\p{Line_break=BA}]");\r
 536             fBB    = new UnicodeSet("[\\p{Line_break=BB}]");\r
 537             fHY    = new UnicodeSet("[\\p{Line_break=HY}]");\r
 538             fCL    = new UnicodeSet("[\\p{Line_break=CL}]");\r
 539             fEX    = new UnicodeSet("[\\p{Line_break=EX}]");\r
 540             fIN    = new UnicodeSet("[\\p{Line_break=IN}]");\r
 541             fNS    = new UnicodeSet("[\\p{Line_break=NS}]");\r
 542             fOP    = new UnicodeSet("[\\p{Line_break=OP}]");\r
 543             fQU    = new UnicodeSet("[\\p{Line_break=QU}]");\r
 544             fIS    = new UnicodeSet("[\\p{Line_break=IS}]");\r
 545             fNU    = new UnicodeSet("[\\p{Line_break=NU}]");\r
 546             fPO    = new UnicodeSet("[\\p{Line_break=PO}]");\r
 547             fPR    = new UnicodeSet("[\\p{Line_break=PR}]");\r
 548             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");\r
 549             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");\r
 550             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");\r
 551             fID    = new UnicodeSet("[\\p{Line_break=ID}]");\r
 552             fSA    = new UnicodeSet("[\\p{Line_break=SA}]");\r
 553             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");\r
 554             fJV    = new UnicodeSet("[\\p{Line_break=JV}]");\r
 555             fJT    = new UnicodeSet("[\\p{Line_break=JT}]");\r
 556             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");\r
 557             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");\r
 558             fSG    = new UnicodeSet("[\\ud800-\\udfff]");\r
 559             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");\r
 560 \r
 561             \r
 562             fAL.addAll(fXX);     // Default behavior for XX is identical to AL\r
 563             fAL.addAll(fAI);     // Default behavior for AI is identical to AL\r
 564             fAL.addAll(fSA);     // Default behavior for SA is XX, which defaults to AL\r
 565             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL\r
 566             \r
 567             \r
 568             \r
 569             fSets.add(fBK);\r
 570             fSets.add(fCR);\r
 571             fSets.add(fLF);\r
 572             fSets.add(fCM);\r
 573             fSets.add(fNL);\r
 574             fSets.add(fWJ);\r
 575             fSets.add(fZW);\r
 576             fSets.add(fGL);\r
 577             fSets.add(fCB);\r
 578             fSets.add(fSP);\r
 579             fSets.add(fB2);\r
 580             fSets.add(fBA);\r
 581             fSets.add(fBB);\r
 582             fSets.add(fHY);\r
 583             fSets.add(fH2);\r
 584             fSets.add(fH3);\r
 585             fSets.add(fCL);\r
 586             fSets.add(fEX);\r
 587             fSets.add(fIN);\r
 588             fSets.add(fJL);\r
 589             fSets.add(fJT);\r
 590             fSets.add(fJV);\r
 591             fSets.add(fNS);\r
 592             fSets.add(fOP);\r
 593             fSets.add(fQU);\r
 594             fSets.add(fIS);\r
 595             fSets.add(fNU);\r
 596             fSets.add(fPO);\r
 597             fSets.add(fPR);\r
 598             fSets.add(fSY);\r
 599             fSets.add(fAI);\r
 600             fSets.add(fAL);\r
 601             fSets.add(fID);\r
 602             fSets.add(fWJ);\r
 603             fSets.add(fSA);\r
 604             fSets.add(fSG);\r
 605             \r
 606         }\r
 607         \r
 608         void setText(StringBuffer s) {\r
 609             fText       = s;\r
 610         }\r
 611         \r
 612         \r
 613         \r
 614 \r
 615         int next(int startPos) {\r
 616             int    pos;       //  Index of the char following a potential break position\r
 617             int    thisChar;  //  Character at above position "pos"\r
 618             \r
 619             int    prevPos;   //  Index of the char preceding a potential break position\r
 620             int    prevChar;  //  Character at above position.  Note that prevChar\r
 621                               //   and thisChar may not be adjacent because combining\r
 622                               //   characters between them will be ignored.\r
 623             \r
 624             int    nextPos;   //  Index of the next character following pos.\r
 625                               //     Usually skips over combining marks.\r
 626             int    tPos;      //  temp value.\r
 627             int    matchVals[]  = null;       // Number  Expression Match Results\r
 628  \r
 629             \r
 630             if (startPos >= fText.length()) {\r
 631                 return -1;\r
 632             }\r
 633             \r
 634             \r
 635             // Initial values for loop.  Loop will run the first time without finding breaks,\r
 636             //                           while the invalid values shift out and the "this" and\r
 637             //                           "prev" positions are filled in with good values.\r
 638             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.\r
 639             thisChar = prevChar  = 0;\r
 640             nextPos  = startPos;\r
 641             \r
 642             \r
 643             // Loop runs once per position in the test text, until a break position\r
 644             //  is found.  In each iteration, we are testing for a possible break\r
 645             //  just preceding the character at index "pos".  The character preceding\r
 646             //  this char is at postion "prevPos"; because of combining sequences,\r
 647             //  "prevPos" can be arbitrarily far before "pos".\r
 648             for (;;) {\r
 649                 // Advance to the next position to be tested.\r
 650                 prevPos   = pos;\r
 651                 prevChar  = thisChar;\r
 652                 pos       = nextPos;\r
 653                 nextPos   = moveIndex32(fText, pos, 1);\r
 654                 \r
 655                 // Rule LB2 - Break at end of text.\r
 656                 if (pos >= fText.length()) {\r
 657                     break;\r
 658                 }\r
 659                 \r
 660                 // Rule LB 9 - adjust for combining sequences.\r
 661                 //             We do this rule out-of-order because the adjustment does\r
 662                 //             not effect the way that rules LB 3 through LB 6 match,\r
 663                 //             and doing it here rather than after LB 6 is substantially\r
 664                 //             simpler when combining sequences do occur.\r
 665                 \r
 666                 \r
 667                 // LB 9         Keep combining sequences together.\r
 668                 //              advance over any CM class chars at "pos", \r
 669                 //              result is "nextPos" for the following loop iteration.\r
 670                 thisChar  = UTF16.charAt(fText, pos);\r
 671                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||\r
 672                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {\r
 673                     for (;;) {\r
 674                         if (nextPos == fText.length()) {\r
 675                             break;   \r
 676                         }\r
 677                         int nextChar = UTF16.charAt(fText, nextPos);\r
 678                         if (!fCM.contains(nextChar)) {\r
 679                             break;\r
 680                         }\r
 681                         nextPos = moveIndex32(fText, nextPos, 1);\r
 682                     }\r
 683                 }\r
 684                 \r
 685                 // LB 9 Treat X CM* as if it were X\r
 686                 //        No explicit action required.\r
 687                 \r
 688                 // LB 10     Treat any remaining combining mark as AL\r
 689                 if (fCM.contains(thisChar)) {\r
 690                     thisChar = 'A';   \r
 691                 }\r
 692 \r
 693                 \r
 694                 // If the loop is still warming up - if we haven't shifted the initial\r
 695                 //   -1 positions out of prevPos yet - loop back to advance the\r
 696                 //    position in the input without any further looking for breaks.\r
 697                 if (prevPos == -1) {\r
 698                     continue;\r
 699                 }\r
 700                 \r
 701                 // LB 4  Always break after hard line breaks,\r
 702                 if (fBK.contains(prevChar)) {\r
 703                     break;\r
 704                 }\r
 705                 \r
 706                 // LB 5  Break after CR, LF, NL, but not inside CR LF\r
 707                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {\r
 708                     continue;\r
 709                 }\r
 710                 if  (fCR.contains(prevChar) ||\r
 711                      fLF.contains(prevChar) ||\r
 712                      fNL.contains(prevChar))  {\r
 713                     break;\r
 714                 }\r
 715                 \r
 716                 // LB 6  Don't break before hard line breaks\r
 717                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||\r
 718                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {\r
 719                     continue;\r
 720                 }\r
 721                 \r
 722                 \r
 723                 // LB 7  Don't break before spaces or zero-width space.\r
 724                 if (fSP.contains(thisChar)) {\r
 725                     continue;\r
 726                 }\r
 727                 \r
 728                 if (fZW.contains(thisChar)) {\r
 729                     continue;\r
 730                 }\r
 731                 \r
 732                 // LB 8  Break after zero width space\r
 733                 if (fZW.contains(prevChar)) {\r
 734                     break;\r
 735                 }\r
 736                 \r
 737                 //  LB 9, 10  Already done, at top of loop.\r
 738                 //\r
 739                 \r
 740                 \r
 741                 // LB 11\r
 742                 //    x  WJ\r
 743                 //    WJ  x\r
 744                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {\r
 745                     continue;\r
 746                 }\r
 747                 \r
 748                 \r
 749                 // LB 12\r
 750                 //        GL x\r
 751                 if (fGL.contains(prevChar)) {\r
 752                     continue;\r
 753                 }\r
 754                 \r
 755                 // LB 12a\r
 756                 //    [^SP BA HY] x GL\r
 757                 if (!(fSP.contains(prevChar) ||\r
 758                       fBA.contains(prevChar) ||\r
 759                       fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {\r
 760                     continue;\r
 761                 }\r
 762 \r
 763                 \r
 764                 \r
 765                 // LB 13  Don't break before closings.\r
 766                 //       NU x CL  and NU x IS are not matched here so that they will\r
 767                 //       fall into LB 17 and the more general number regular expression.\r
 768                 //\r
 769                 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||\r
 770                         fEX.contains(thisChar) ||\r
 771                         !fNU.contains(prevChar) && fIS.contains(thisChar) ||\r
 772                         !fNU.contains(prevChar) && fSY.contains(thisChar))    {\r
 773                     continue;\r
 774                 }\r
 775                 \r
 776                 // LB 14  Don't break after OP SP*\r
 777                 //       Scan backwards, checking for this sequence.\r
 778                 //       The OP char could include combining marks, so we actually check for\r
 779                 //           OP CM* SP* x\r
 780                 tPos = prevPos;\r
 781                 if (fSP.contains(prevChar)) {\r
 782                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 783                         tPos=moveIndex32(fText, tPos, -1);\r
 784                     }\r
 785                 }\r
 786                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 787                     tPos=moveIndex32(fText, tPos, -1);\r
 788                 }\r
 789                 if (fOP.contains(UTF16.charAt(fText, tPos))) {\r
 790                     continue;\r
 791                 }\r
 792                 \r
 793                 // LB 15 Do not break within "[ \r
 794                 //       QU CM* SP* x OP\r
 795                 if (fOP.contains(thisChar)) {\r
 796                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*\r
 797                     tPos = prevPos;\r
 798                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 799                         tPos = moveIndex32(fText, tPos, -1);\r
 800                     }\r
 801                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 802                         tPos = moveIndex32(fText, tPos, -1);\r
 803                     }\r
 804                     if (fQU.contains(UTF16.charAt(fText, tPos))) {\r
 805                         continue;\r
 806                     }\r
 807                 }               \r
 808                 \r
 809                 // LB 16   CL SP* x NS\r
 810                 if (fNS.contains(thisChar)) {\r
 811                     tPos = prevPos;\r
 812                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 813                         tPos = moveIndex32(fText, tPos, -1);\r
 814                     }\r
 815                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 816                         tPos = moveIndex32(fText, tPos, -1);\r
 817                     }\r
 818                     if (fCL.contains(UTF16.charAt(fText, tPos))) {\r
 819                         continue;\r
 820                     }\r
 821                 }               \r
 822                 \r
 823                                \r
 824                 // LB 17        B2 SP* x B2\r
 825                 if (fB2.contains(thisChar)) {\r
 826                     tPos = prevPos;\r
 827                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 828                         tPos = moveIndex32(fText, tPos, -1);\r
 829                     }\r
 830                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 831                         tPos = moveIndex32(fText, tPos, -1);\r
 832                     }\r
 833                     if (fB2.contains(UTF16.charAt(fText, tPos))) {\r
 834                         continue;\r
 835                     }\r
 836                 }               \r
 837                 \r
 838                 // LB 18    break after space\r
 839                 if (fSP.contains(prevChar)) {\r
 840                     break;\r
 841                 }\r
 842                 \r
 843                 // LB 19\r
 844                 //    x   QU\r
 845                 //    QU  x\r
 846                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {\r
 847                     continue;\r
 848                 }\r
 849                 \r
 850                 // LB 20  Break around a CB\r
 851                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {\r
 852                     break;\r
 853                 }\r
 854                 \r
 855                 // LB 21\r
 856                 if (fBA.contains(thisChar) ||\r
 857                         fHY.contains(thisChar) ||\r
 858                         fNS.contains(thisChar) ||\r
 859                         fBB.contains(prevChar) )   {\r
 860                     continue;\r
 861                 }\r
 862                 \r
 863                 // LB 22\r
 864                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||\r
 865                         fID.contains(prevChar) && fIN.contains(thisChar) ||\r
 866                         fIN.contains(prevChar) && fIN.contains(thisChar) ||\r
 867                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {\r
 868                     continue;\r
 869                 }\r
 870                 \r
 871                 \r
 872                 // LB 23    ID x PO    (Note:  Leading CM behaves like ID)\r
 873                 //          AL x NU\r
 874                 //          NU x AL\r
 875                 if (fID.contains(prevChar) && fPO.contains(thisChar) ||\r
 876                         fAL.contains(prevChar) && fNU.contains(thisChar) ||\r
 877                         fNU.contains(prevChar) && fAL.contains(thisChar) )   {\r
 878                     continue;\r
 879                 }\r
 880                 \r
 881                 // LB 24  Do not break between prefix and letters or ideographs.\r
 882                 //        PR x ID\r
 883                 //        PR x AL\r
 884                 //        PO x AL\r
 885                 if (fPR.contains(prevChar) && fID.contains(thisChar) ||\r
 886                     fPR.contains(prevChar) && fAL.contains(thisChar) ||\r
 887                     fPO.contains(prevChar) && fAL.contains(thisChar))  {\r
 888                     continue;\r
 889                 }\r
 890                 \r
 891                 \r
 892                 // LB 25    Numbers\r
 893                 matchVals = LBNumberCheck(fText, prevPos, matchVals);\r
 894                 if (matchVals[0] != -1) {\r
 895                     // Matched a number.  But could have been just a single digit, which would\r
 896                     //    not represent a "no break here" between prevChar and thisChar\r
 897                     int numEndIdx = matchVals[1];  // idx of first char following num\r
 898                     if (numEndIdx > pos) {\r
 899                         // Number match includes at least the two chars being checked\r
 900                         if (numEndIdx > nextPos) {\r
 901                             // Number match includes additional chars.  Update pos and nextPos\r
 902                             //   so that next loop iteration will continue at the end of the number,\r
 903                             //   checking for breaks between last char in number & whatever follows.\r
 904                             nextPos = numEndIdx;\r
 905                             pos     = numEndIdx;\r
 906                             do {\r
 907                                 pos = moveIndex32(fText, pos, -1);  \r
 908                                 thisChar = UTF16.charAt(fText, pos);\r
 909                             }\r
 910                             while (fCM.contains(thisChar));\r
 911                         }\r
 912                         continue;\r
 913                     }\r
 914                 }\r
 915                 \r
 916                 \r
 917                 // LB 26  Do not break Korean Syllables\r
 918                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||\r
 919                                                 fJV.contains(thisChar) ||\r
 920                                                 fH2.contains(thisChar) ||\r
 921                                                 fH3.contains(thisChar))) {\r
 922                                                     continue;\r
 923                                                 }\r
 924 \r
 925                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&\r
 926                     (fJV.contains(thisChar) || fJT.contains(thisChar))) {\r
 927                         continue;\r
 928                 }\r
 929 \r
 930                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&\r
 931                     fJT.contains(thisChar)) {\r
 932                         continue;\r
 933                 }\r
 934 \r
 935                 // LB 27 Treat a Korean Syllable Block the same as ID\r
 936                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||\r
 937                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&\r
 938                     fIN.contains(thisChar)) {\r
 939                         continue;\r
 940                     }\r
 941                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||\r
 942                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&\r
 943                     fPO.contains(thisChar)) {\r
 944                         continue;\r
 945                     }\r
 946                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||\r
 947                     fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {\r
 948                         continue;\r
 949                     }\r
 950 \r
 951                 \r
 952                 \r
 953                 // LB 28 Do not break between alphabetics\r
 954                 if (fAL.contains(prevChar) && fAL.contains(thisChar)) {\r
 955                     continue;\r
 956                 }\r
 957                 \r
 958                 // LB 29  Do not break between numeric punctuation and alphabetics\r
 959                 if (fIS.contains(prevChar) && fAL.contains(thisChar)) {\r
 960                     continue;\r
 961                 }\r
 962                 \r
 963                 // LB 30  (Withdrawn as of Unicode 5.1)\r
 964               \r
 965                 // LB 31    Break everywhere else\r
 966                 break;            \r
 967             }\r
 968             \r
 969             return pos;\r
 970         }\r
 971         \r
 972         \r
 973         \r
 974         // Match the following regular expression in the input text.\r
 975         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?\r
 976         //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)\r
 977         //  retVals array  [0]  index of the start of the match, or -1 if no match\r
 978         //                 [1]  index of first char following the match.\r
 979         //  Can not use Java regex because need supplementary character support,\r
 980         //     and because Unicode char properties version must be the same as in\r
 981         //     the version of ICU being tested.\r
 982         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {\r
 983             if (retVals == null) {\r
 984                 retVals = new int[2];\r
 985              }\r
 986             retVals[0]     = -1;  // Indicates no match.\r
 987             int matchState = 0;\r
 988             int idx        = startIdx;\r
 989             \r
 990             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){\r
 991                 int c = UTF16.charAt(s, idx);\r
 992                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);\r
 993                 switch (matchState) {\r
 994                     case 0:   \r
 995                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||\r
 996                             cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {\r
 997                             matchState = 1;  \r
 998                             break;\r
 999                         }\r
1000                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {\r
1001                             matchState = 4;\r
1002                             break;\r
1003                         }\r
1004                         if (cLBType == UCharacter.LineBreak.HYPHEN) {\r
1005                             matchState = 4;\r
1006                             break;\r
1007                         }\r
1008                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1009                             matchState = 7;\r
1010                             break;\r
1011                         }\r
1012                         break matchLoop;   /* No Match  */\r
1013                         \r
1014                     case 1:\r
1015                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1016                             matchState = 1;\r
1017                             break;\r
1018                         }\r
1019                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {\r
1020                             matchState = 4;\r
1021                             break;\r
1022                         }\r
1023                         if (cLBType == UCharacter.LineBreak.HYPHEN) {\r
1024                             matchState = 4;\r
1025                             break;\r
1026                         }\r
1027                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1028                             matchState = 7;\r
1029                             break;\r
1030                         }\r
1031                         break matchLoop;   /* No Match  */\r
1032                         \r
1033                         \r
1034                     case 4:\r
1035                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1036                             matchState = 4;\r
1037                             break;\r
1038                         }\r
1039                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1040                             matchState = 7;\r
1041                             break;\r
1042                         }\r
1043                         break matchLoop;   /* No Match  */\r
1044                         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?\r
1045                         //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)\r
1046                  \r
1047                     case 7:\r
1048                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1049                             matchState = 7;\r
1050                             break;                           \r
1051                         }\r
1052                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1053                             matchState = 7;\r
1054                             break;                           \r
1055                         }\r
1056                         if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {\r
1057                             matchState = 7;\r
1058                             break;                           \r
1059                         }\r
1060                         if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {\r
1061                             matchState = 7;\r
1062                             break;       \r
1063                         }\r
1064                         if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {\r
1065                             matchState = 9;\r
1066                             break;                           \r
1067                         }\r
1068                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {\r
1069                             matchState = 11;\r
1070                             break;                           \r
1071                         }\r
1072                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {\r
1073                             matchState = 11;\r
1074                             break;                           \r
1075                         }\r
1076 \r
1077                         break matchLoop;    // Match Complete.\r
1078                     case 9:\r
1079                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1080                             matchState = 9;\r
1081                             break;                           \r
1082                         }\r
1083                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {\r
1084                             matchState = 11;\r
1085                             break;                           \r
1086                         }\r
1087                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {\r
1088                             matchState = 11;\r
1089                             break;                           \r
1090                         }\r
1091                         break matchLoop;    // Match Complete.\r
1092                     case 11:\r
1093                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1094                             matchState = 11;\r
1095                             break;                           \r
1096                         }\r
1097                         break matchLoop;    // Match Complete.\r
1098                 }\r
1099             }\r
1100             if (matchState > 4) {\r
1101                 retVals[0] = startIdx;   \r
1102                  retVals[1] = idx;   \r
1103             }\r
1104             return retVals;\r
1105         }\r
1106         \r
1107         \r
1108         List  charClasses() {\r
1109             return fSets;\r
1110         }\r
1111         \r
1112         \r
1113     \r
1114     }\r
1115 \r
1116      \r
1117     /**\r
1118      * \r
1119      * Sentence Monkey Test Class\r
1120      *\r
1121      * \r
1122      * \r
1123      */\r
1124     static class RBBISentenceMonkey extends RBBIMonkeyKind {\r
1125         List                 fSets;\r
1126         StringBuffer         fText;\r
1127 \r
1128         UnicodeSet           fSepSet;\r
1129         UnicodeSet           fFormatSet;\r
1130         UnicodeSet           fSpSet;\r
1131         UnicodeSet           fLowerSet;\r
1132         UnicodeSet           fUpperSet;\r
1133         UnicodeSet           fOLetterSet;\r
1134         UnicodeSet           fNumericSet;\r
1135         UnicodeSet           fATermSet;\r
1136         UnicodeSet           fSContinueSet;\r
1137         UnicodeSet           fSTermSet;\r
1138         UnicodeSet           fCloseSet;\r
1139         UnicodeSet           fOtherSet;\r
1140         UnicodeSet           fExtendSet;\r
1141 \r
1142  \r
1143         \r
1144         RBBISentenceMonkey() {\r
1145             fCharProperty  = UProperty.SENTENCE_BREAK;\r
1146 \r
1147             fSets            = new ArrayList();\r
1148 \r
1149             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator\r
1150             //                       set and made into character classes of their own.  For the monkey impl,\r
1151             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.\r
1152             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");\r
1153             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");\r
1154             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");\r
1155             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");\r
1156             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");\r
1157             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");\r
1158             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");\r
1159             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");\r
1160             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");\r
1161             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");\r
1162             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");\r
1163             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");\r
1164             fOtherSet        = new UnicodeSet();\r
1165 \r
1166 \r
1167             fOtherSet.complement();\r
1168             fOtherSet.removeAll(fSepSet);\r
1169             fOtherSet.removeAll(fFormatSet);\r
1170             fOtherSet.removeAll(fSpSet);\r
1171             fOtherSet.removeAll(fLowerSet);\r
1172             fOtherSet.removeAll(fUpperSet);\r
1173             fOtherSet.removeAll(fOLetterSet);\r
1174             fOtherSet.removeAll(fNumericSet);\r
1175             fOtherSet.removeAll(fATermSet);\r
1176             fOtherSet.removeAll(fSContinueSet);\r
1177             fOtherSet.removeAll(fSTermSet);\r
1178             fOtherSet.removeAll(fCloseSet);\r
1179             fOtherSet.removeAll(fExtendSet);\r
1180 \r
1181             fSets.add(fSepSet);\r
1182             fSets.add(fFormatSet);\r
1183 \r
1184             fSets.add(fSpSet);\r
1185             fSets.add(fLowerSet);\r
1186             fSets.add(fUpperSet);\r
1187             fSets.add(fOLetterSet);\r
1188             fSets.add(fNumericSet);\r
1189             fSets.add(fATermSet);\r
1190             fSets.add(fSContinueSet);\r
1191             fSets.add(fSTermSet);\r
1192             fSets.add(fCloseSet);\r
1193             fSets.add(fOtherSet);\r
1194             fSets.add(fExtendSet);\r
1195         }\r
1196         \r
1197         \r
1198         List  charClasses() {\r
1199             return fSets;  \r
1200         }\r
1201         \r
1202         void   setText(StringBuffer s) { \r
1203             fText = s;        \r
1204         }   \r
1205 \r
1206         \r
1207         //      moveBack()   Find the "significant" code point preceding the index i.\r
1208         //      Skips over ($Extend | $Format)*\r
1209         // \r
1210         private int moveBack(int i) {\r
1211             \r
1212             if (i <= 0) {\r
1213                 return -1;\r
1214             }\r
1215             \r
1216             int      c;\r
1217             int      j = i;\r
1218             do {\r
1219                 j = moveIndex32(fText, j, -1);\r
1220                 c = UTF16.charAt(fText, j);\r
1221             }\r
1222             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));\r
1223             return j;\r
1224         }\r
1225         \r
1226         \r
1227         int moveForward(int i) {\r
1228             if (i>=fText.length()) {\r
1229                 return fText.length();\r
1230             }\r
1231             int   c;\r
1232             int   j = i;\r
1233             do {\r
1234                 j = moveIndex32(fText, j, 1);\r
1235                 c = cAt(j);\r
1236             }\r
1237             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));\r
1238             return j;\r
1239            \r
1240         }\r
1241         \r
1242         int cAt(int pos) {\r
1243             if (pos<0 || pos>=fText.length()) {\r
1244                 return -1;\r
1245             }\r
1246             return UTF16.charAt(fText, pos);\r
1247         }\r
1248 \r
1249         int   next(int prevPos) {  \r
1250             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the \r
1251                                         //   break position being tested.  The candidate break\r
1252                                         //   location is before p2.\r
1253             int     breakPos = -1;\r
1254             \r
1255             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.\r
1256             int c;\r
1257             \r
1258             // Prev break at end of string.  return DONE.\r
1259             if (prevPos >= fText.length()) {\r
1260                 return -1;\r
1261             }\r
1262             /*p0 =*/ p1 = p2 = p3 = prevPos;\r
1263             c3 = UTF16.charAt(fText, prevPos);\r
1264             c0 = c1 = c2 = 0;\r
1265             \r
1266             // Loop runs once per "significant" character position in the input text.\r
1267             for (;;) {\r
1268                 // Move all of the positions forward in the input string.\r
1269                 /*p0 = p1;*/  c0 = c1;\r
1270                 p1 = p2;  c1 = c2;\r
1271                 p2 = p3;  c2 = c3;\r
1272                 \r
1273                 // Advancd p3 by  X(Extend | Format)*   Rule 4\r
1274                 p3 = moveForward(p3);\r
1275                 c3 = cAt(p3);\r
1276                 \r
1277                 // Rule (3) CR x LF\r
1278                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {\r
1279                     continue;\r
1280                 }\r
1281                 \r
1282                 // Rule (4)    Sep  <break>\r
1283                 if (fSepSet.contains(c1)) {\r
1284                     p2 = p1+1;   // Separators don't combine with Extend or Format\r
1285                     break;\r
1286                 }               \r
1287 \r
1288                 if (p2 >= fText.length()) {\r
1289                     // Reached end of string.  Always a break position.\r
1290                     break;\r
1291                 }\r
1292 \r
1293                 if (p2 == prevPos) {\r
1294                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)\r
1295                     continue;\r
1296                 }\r
1297 \r
1298                 // Rule (6).   ATerm x Numeric\r
1299                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {\r
1300                     continue;\r
1301                 }\r
1302 \r
1303                 // Rule (7).  Upper ATerm  x  Uppper\r
1304                 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {\r
1305                     continue;\r
1306                 }\r
1307 \r
1308                 // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower\r
1309                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a \r
1310                 //                  note to the Unicode 5.0 documents.\r
1311                 int p8 = p1;\r
1312                 while (p8>0 && fSpSet.contains(cAt(p8))) {\r
1313                     p8 = moveBack(p8);\r
1314                 }\r
1315                 while (p8>0 && fCloseSet.contains(cAt(p8))) {\r
1316                     p8 = moveBack(p8);\r
1317                 }\r
1318                 if (fATermSet.contains(cAt(p8))) {\r
1319                     p8=p2;\r
1320                     for (;;) {\r
1321                         c = cAt(p8);\r
1322                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||\r
1323                             fLowerSet.contains(c) || fSepSet.contains(c) ||\r
1324                             fATermSet.contains(c) || fSTermSet.contains(c))  \r
1325                          {\r
1326                             break;\r
1327                         }\r
1328                         p8 = moveForward(p8);\r
1329                     }\r
1330                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {\r
1331                         continue;\r
1332                     }\r
1333                 }\r
1334                 \r
1335                 // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)\r
1336                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {\r
1337                     p8 = p1;\r
1338                     while (setContains(fSpSet, cAt(p8))) {\r
1339                         p8 = moveBack(p8);\r
1340                     }\r
1341                     while (setContains(fCloseSet, cAt(p8))) {\r
1342                         p8 = moveBack(p8);\r
1343                     }\r
1344                     c = cAt(p8);\r
1345                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {\r
1346                         continue;\r
1347                     }\r
1348                 }\r
1349 \r
1350 \r
1351                 // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)\r
1352                 int p9 = p1;\r
1353                 while (p9>0 && fCloseSet.contains(cAt(p9))) {\r
1354                     p9 = moveBack(p9);\r
1355                 }\r
1356                 c = cAt(p9);\r
1357                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {\r
1358                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {\r
1359                         continue;\r
1360                     }\r
1361                 }\r
1362 \r
1363                 // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)\r
1364                 int p10 = p1;\r
1365                 while (p10>0 && fSpSet.contains(cAt(p10))) {\r
1366                     p10 = moveBack(p10);\r
1367                 }\r
1368                 while (p10>0 && fCloseSet.contains(cAt(p10))) {\r
1369                     p10 = moveBack(p10);\r
1370                 }\r
1371                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {\r
1372                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {\r
1373                         continue;\r
1374                     }\r
1375                 }\r
1376 \r
1377                 // Rule (11)  (STerm | ATerm) Close* Sp*   <break>\r
1378                 int p11 = p1;\r
1379                 if (p11>0 && fSepSet.contains(cAt(p11))) {\r
1380                     p11 = moveBack(p11);\r
1381                 }\r
1382                 while (p11>0 && fSpSet.contains(cAt(p11))) {\r
1383                     p11 = moveBack(p11);\r
1384                 }\r
1385                 while (p11>0 && fCloseSet.contains(cAt(p11))) {\r
1386                     p11 = moveBack(p11);\r
1387                 }\r
1388                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {\r
1389                     break;\r
1390                 }\r
1391 \r
1392                 //  Rule (12)  Any x Any\r
1393                 continue;\r
1394             }\r
1395             breakPos = p2;\r
1396             return breakPos;\r
1397         }\r
1398            \r
1399 \r
1400         \r
1401     }\r
1402 \r
1403  \r
1404     /**\r
1405      * Move an index into a string by n code points.\r
1406      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were\r
1407      *   complicating usage.\r
1408      * @param s   a Text string\r
1409      * @param pos The starting code unit index into the text string\r
1410      * @param amt The amount to adjust the string by.\r
1411      * @return    The adjusted code unit index, pinned to the string's length, or\r
1412      *            unchanged if input index was outside of the string.\r
1413      */\r
1414     static int moveIndex32(StringBuffer s, int pos, int amt) {\r
1415         int i;\r
1416         char  c;\r
1417         if (amt>0) {\r
1418             for (i=0; i<amt; i++) {\r
1419                 if (pos >= s.length()) {\r
1420                     return s.length();                   \r
1421                 }\r
1422                 c = s.charAt(pos);\r
1423                 pos++;\r
1424                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {\r
1425                     c = s.charAt(pos);\r
1426                     if (UTF16.isTrailSurrogate(c)) {\r
1427                         pos++;   \r
1428                     }\r
1429                 }\r
1430             }\r
1431         } else {\r
1432             for (i=0; i>amt; i--) {\r
1433                 if (pos <= 0) {\r
1434                     return 0;   \r
1435                 }\r
1436                 pos--;\r
1437                 c = s.charAt(pos);\r
1438                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {\r
1439                     c = s.charAt(pos);\r
1440                     if (UTF16.isLeadSurrogate(c)) {\r
1441                         pos--;   \r
1442                     }\r
1443                 }\r
1444             }\r
1445         }\r
1446         return pos;\r
1447     }\r
1448     \r
1449     /**\r
1450      * No-exceptions form of UnicodeSet.contains(c).\r
1451      *    Simplifies loops that terminate with an end-of-input character value.\r
1452      * @param s  A unicode set\r
1453      * @param c  A code point value\r
1454      * @return   true if the set contains c.\r
1455      */\r
1456     static boolean setContains(UnicodeSet s, int c) {\r
1457         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {\r
1458             return false;\r
1459         }\r
1460         return s.contains(c);\r
1461     }\r
1462     \r
1463     \r
1464     /**\r
1465      * return the index of the next code point in the input text.\r
1466      * @param i the preceding index\r
1467      * @return\r
1468      * @internal\r
1469      */\r
1470     static int  nextCP(StringBuffer s, int i) {\r
1471         if (i == -1) {\r
1472             // End of Input indication.  Continue to return end value.\r
1473             return -1;\r
1474         }\r
1475         int  retVal = i + 1;\r
1476         if (retVal > s.length()) {\r
1477             return -1;\r
1478         }\r
1479         int  c = UTF16.charAt(s, i);\r
1480         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {\r
1481             retVal++;\r
1482         }\r
1483         return retVal;\r
1484     }\r
1485     \r
1486     \r
1487     /**\r
1488      * random number generator.  Not using Java's built-in Randoms for two reasons:\r
1489      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.\r
1490      *    2.  We need to get and restore the seed from values occurring in the middle\r
1491      *        of a long sequence, to more easily reproduce failing cases.\r
1492      */\r
1493     private static int m_seed = 1;\r
1494     private static int  m_rand()\r
1495     {\r
1496         m_seed = m_seed * 1103515245 + 12345;\r
1497         return (int)(m_seed >>> 16) % 32768;\r
1498     }\r
1499 \r
1500     // Helper function for formatting error output.\r
1501     //   Append a string into a fixed-size field in a StringBuffer.\r
1502     //   Blank-pad the string if it is shorter than the field.\r
1503     //   Truncate the source string if it is too long.\r
1504     //\r
1505     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {\r
1506         int appendLen = src.length();\r
1507         if (appendLen >= fieldLen) {\r
1508             dest.append(src.substring(0, fieldLen));\r
1509         } else {\r
1510             dest.append(src);\r
1511             while (appendLen < fieldLen) {\r
1512                 dest.append(' ');\r
1513                 appendLen++;\r
1514             }\r
1515         }\r
1516     }\r
1517 \r
1518     // Helper function for formatting error output.\r
1519     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format\r
1520     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {\r
1521            String hexChars = "0123456789abcdef";\r
1522            if (c < 0x10000) {\r
1523                 dest.append("\\u");\r
1524                 for (int bn=12; bn>=0; bn-=4) {\r
1525                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));\r
1526                 }\r
1527                 appendToBuf(dest, " ", fieldLen-6);\r
1528             } else {\r
1529                 dest.append("\\U");\r
1530                 for (int bn=28; bn>=0; bn-=4) {\r
1531                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));\r
1532                 }\r
1533                 appendToBuf(dest, " ", fieldLen-10);\r
1534 \r
1535             }\r
1536        }\r
1537     \r
1538 /**\r
1539  *  Run a RBBI monkey test.  Common routine, for all break iterator types.\r
1540  *    Parameters:\r
1541  *       bi      - the break iterator to use\r
1542  *       mk      - MonkeyKind, abstraction for obtaining expected results\r
1543  *       name    - Name of test (char, word, etc.) for use in error messages\r
1544  *       seed    - Seed for starting random number generator (parameter from user)\r
1545  *       numIterations\r
1546  */\r
1547 void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {\r
1548     int              TESTSTRINGLEN = 500;\r
1549     StringBuffer     testText         = new StringBuffer();\r
1550     int              numCharClasses;\r
1551     List             chClasses;\r
1552     int[]            expected         = new int[TESTSTRINGLEN*2 + 1];\r
1553     int              expectedCount    = 0;\r
1554     boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];\r
1555     boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];\r
1556     boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];\r
1557     boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];\r
1558     boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];\r
1559     boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];\r
1560     int              i;\r
1561     int              loopCount        = 0;\r
1562     boolean          printTestData    = false;\r
1563     boolean          printBreaksFromBI = false;\r
1564 \r
1565     m_seed = seed;\r
1566 \r
1567     numCharClasses = mk.charClasses().size();\r
1568     chClasses      = mk.charClasses();\r
1569 \r
1570     // Verify that the character classes all have at least one member.\r
1571     for (i=0; i<numCharClasses; i++) {\r
1572         UnicodeSet s = (UnicodeSet)chClasses.get(i);\r
1573         if (s == null || s.size() == 0) {\r
1574             errln("Character Class " + i + " is null or of zero size.");\r
1575             return;\r
1576         }\r
1577     }\r
1578 \r
1579     //--------------------------------------------------------------------------------------------\r
1580     //\r
1581     //  Debugging settings.  Comment out everything in the following block for normal operation\r
1582     //\r
1583     //--------------------------------------------------------------------------------------------\r
1584     // numIterations = -1;  \r
1585     // RuleBasedBreakIterator_New.fTrace = true;\r
1586     // m_seed = 859056465;\r
1587     // TESTSTRINGLEN = 50;\r
1588     // printTestData = true;\r
1589     // printBreaksFromBI = true;\r
1590     // ((RuleBasedBreakIterator_New)bi).dump();\r
1591     \r
1592     //--------------------------------------------------------------------------------------------\r
1593     //\r
1594     //  End of Debugging settings.  \r
1595     //\r
1596     //--------------------------------------------------------------------------------------------\r
1597     \r
1598     int  dotsOnLine = 0;\r
1599      while (loopCount < numIterations || numIterations == -1) {\r
1600         if (numIterations == -1 && loopCount % 10 == 0) {\r
1601             // If test is running in an infinite loop, display a periodic tic so\r
1602             //   we can tell that it is making progress.\r
1603             System.out.print(".");\r
1604             if (dotsOnLine++ >= 80){\r
1605                 System.out.println();\r
1606                 dotsOnLine = 0;\r
1607             }\r
1608         }\r
1609         // Save current random number seed, so that we can recreate the random numbers\r
1610         //   for this loop iteration in event of an error.\r
1611         seed = m_seed;\r
1612 \r
1613         testText.setLength(0);\r
1614         // Populate a test string with data.\r
1615         if (printTestData) {\r
1616             System.out.println("Test Data string ..."); \r
1617         }\r
1618         for (i=0; i<TESTSTRINGLEN; i++) {\r
1619             int        aClassNum = m_rand() % numCharClasses;\r
1620             UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);\r
1621             int        charIdx   = m_rand() % classSet.size();\r
1622             int        c         = classSet.charAt(charIdx);\r
1623             if (c < 0) {   // TODO:  deal with sets containing strings.\r
1624                 errln("c < 0");\r
1625             }\r
1626             UTF16.appendCodePoint(testText, c);\r
1627             if (printTestData) {\r
1628                 System.out.print(Integer.toHexString(c) + " ");\r
1629             }\r
1630         }\r
1631         if (printTestData) {\r
1632             System.out.println(); \r
1633         }\r
1634 \r
1635         Arrays.fill(expected, 0);\r
1636         Arrays.fill(expectedBreaks, false);\r
1637         Arrays.fill(forwardBreaks, false);\r
1638         Arrays.fill(reverseBreaks, false);\r
1639         Arrays.fill(isBoundaryBreaks, false);\r
1640         Arrays.fill(followingBreaks, false);\r
1641         Arrays.fill(precedingBreaks, false);\r
1642  \r
1643         // Calculate the expected results for this test string.\r
1644         mk.setText(testText);\r
1645         expectedCount = 0;\r
1646         expectedBreaks[0] = true;\r
1647         expected[expectedCount ++] = 0;\r
1648         int breakPos = 0;\r
1649         int lastBreakPos = -1;\r
1650         for (;;) {\r
1651             lastBreakPos = breakPos;\r
1652             breakPos = mk.next(breakPos);\r
1653             if (breakPos == -1) {\r
1654                 break;\r
1655             }\r
1656             if (breakPos > testText.length()) {\r
1657                 errln("breakPos > testText.length()");\r
1658             }\r
1659             if (lastBreakPos >= breakPos) {\r
1660                 errln("Next() not increasing.");\r
1661                 // break;\r
1662             }\r
1663             expectedBreaks[breakPos] = true;\r
1664             expected[expectedCount ++] = breakPos;\r
1665         }\r
1666 \r
1667         // Find the break positions using forward iteration\r
1668         if (printBreaksFromBI) {\r
1669             System.out.println("Breaks from BI...");  \r
1670         }\r
1671         bi.setText(testText.toString());\r
1672         for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {\r
1673             if (i < 0 || i > testText.length()) {\r
1674                 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");\r
1675                 break;\r
1676             }\r
1677             if (printBreaksFromBI) {\r
1678                 System.out.print(Integer.toHexString(i) + " ");\r
1679             }\r
1680             forwardBreaks[i] = true;\r
1681         }\r
1682         if (printBreaksFromBI) {\r
1683             System.out.println();\r
1684         }\r
1685 \r
1686         // Find the break positions using reverse iteration\r
1687         for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {\r
1688             if (i < 0 || i > testText.length()) {\r
1689                 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);\r
1690                 break;\r
1691             }\r
1692             reverseBreaks[i] = true;\r
1693         }\r
1694 \r
1695         // Find the break positions using isBoundary() tests.\r
1696         for (i=0; i<=testText.length(); i++) {\r
1697             isBoundaryBreaks[i] = bi.isBoundary(i);\r
1698         }\r
1699 \r
1700         // Find the break positions using the following() function.\r
1701         lastBreakPos = 0;\r
1702         followingBreaks[0] = true;\r
1703         for (i=0; i<testText.length(); i++) {\r
1704             breakPos = bi.following(i);\r
1705             if (breakPos <= i ||\r
1706                 breakPos < lastBreakPos ||\r
1707                 breakPos > testText.length() ||\r
1708                 breakPos > lastBreakPos && lastBreakPos > i ) {\r
1709                 errln(name + " break monkey test: " +\r
1710                     "Out of range value returned by BreakIterator::following().\n" +\r
1711                     "index=" + i + "following returned=" + breakPos +\r
1712                     "lastBreak=" + lastBreakPos);\r
1713                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.\r
1714             } else {\r
1715                 followingBreaks[breakPos] = true;\r
1716                 lastBreakPos = breakPos;\r
1717             }\r
1718         }\r
1719         \r
1720         // Find the break positions using the preceding() function.\r
1721         lastBreakPos = testText.length();\r
1722         precedingBreaks[testText.length()] = true;\r
1723         for (i=testText.length(); i>0; i--) {\r
1724             breakPos = bi.preceding(i);\r
1725             if (breakPos >= i ||\r
1726                 breakPos > lastBreakPos ||\r
1727                 breakPos < 0 ||\r
1728                 breakPos < lastBreakPos && lastBreakPos < i ) {\r
1729                 errln(name + " break monkey test: " +\r
1730                         "Out of range value returned by BreakIterator::preceding().\n" +\r
1731                         "index=" + i + "preceding returned=" + breakPos +\r
1732                         "lastBreak=" + lastBreakPos);\r
1733                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.\r
1734             } else {\r
1735                 precedingBreaks[breakPos] = true;\r
1736                 lastBreakPos = breakPos;\r
1737             }\r
1738         }\r
1739 \r
1740         \r
1741 \r
1742         // Compare the expected and actual results.\r
1743         for (i=0; i<=testText.length(); i++) {\r
1744             String errorType = null;\r
1745             if  (forwardBreaks[i] != expectedBreaks[i]) {\r
1746                 errorType = "next()";\r
1747             } else if (reverseBreaks[i] != forwardBreaks[i]) {\r
1748                 errorType = "previous()";\r
1749             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {\r
1750                 errorType = "isBoundary()";\r
1751             } else if (followingBreaks[i] != expectedBreaks[i]) {\r
1752                 errorType = "following()";\r
1753             } else if (precedingBreaks[i] != expectedBreaks[i]) {\r
1754                 errorType = "preceding()";\r
1755             }\r
1756 \r
1757 \r
1758             if (errorType != null) {\r
1759                 // Format a range of the test text that includes the failure as\r
1760                 //  a data item that can be included in the rbbi test data file.\r
1761 \r
1762                 // Start of the range is the last point where expected and actual results\r
1763                 //   both agreed that there was a break position.\r
1764                 int startContext = i;\r
1765                 int count = 0;\r
1766                 for (;;) {\r
1767                     if (startContext==0) { break; }\r
1768                     startContext --;\r
1769                     if (expectedBreaks[startContext]) {\r
1770                         if (count == 2) break;\r
1771                         count ++;\r
1772                     }\r
1773                 }\r
1774 \r
1775                 // End of range is two expected breaks past the start position.\r
1776                 int endContext = i + 1;\r
1777                 int ci;\r
1778                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.\r
1779                     for (;;) {\r
1780                         if (endContext >= testText.length()) {break;}\r
1781                         if (expectedBreaks[endContext-1]) { \r
1782                             if (count == 0) break;\r
1783                             count --;\r
1784                         }\r
1785                         endContext ++;\r
1786                     }\r
1787                 }\r
1788 \r
1789                 // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"\r
1790                 StringBuffer errorText = new StringBuffer();\r
1791 \r
1792                 int      c;    // Char from test data\r
1793                 for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {\r
1794                     if (ci == i) {\r
1795                         // This is the location of the error.\r
1796                         errorText.append("<?>---------------------------------\n");\r
1797                     } else if (expectedBreaks[ci]) {\r
1798                         // This a non-error expected break position.\r
1799                         errorText.append("------------------------------------\n");\r
1800                     }\r
1801                     if (ci < testText.length()) {\r
1802                         c = UTF16.charAt(testText, ci);\r
1803                         appendCharToBuf(errorText, c, 11);\r
1804                         String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);\r
1805                         appendToBuf(errorText, gc, 8);\r
1806                         int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);\r
1807                         String extraPropValue = \r
1808                             UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);\r
1809                         appendToBuf(errorText, extraPropValue, 20);\r
1810 \r
1811                         String charName = UCharacter.getExtendedName(c);\r
1812                         appendToBuf(errorText, charName, 40);\r
1813                         errorText.append('\n');\r
1814                     }\r
1815                 }\r
1816                 if (ci == testText.length() && ci != -1) {\r
1817                     errorText.append("<>");\r
1818                 }\r
1819                 errorText.append("</data>\n");\r
1820 \r
1821                 // Output the error\r
1822                 errln(name + " break monkey test error.  " + \r
1823                      (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +\r
1824                       "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +\r
1825                       errorText);\r
1826                 break;\r
1827             }\r
1828         }\r
1829 \r
1830         loopCount++;\r
1831     }\r
1832 }\r
1833 \r
1834 public void TestCharMonkey() {\r
1835     \r
1836     int        loopCount = 500;\r
1837     int        seed      = 1;\r
1838     \r
1839     if (params.inclusion >= 9) {\r
1840         loopCount = 10000;\r
1841     }\r
1842     \r
1843     RBBICharMonkey  m = new RBBICharMonkey();\r
1844     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);\r
1845     RunMonkey(bi, m, "char", seed, loopCount);\r
1846 }\r
1847 \r
1848 public void TestWordMonkey() {\r
1849     \r
1850     int        loopCount = 500;\r
1851     int        seed      = 1;\r
1852     \r
1853     if (params.inclusion >= 9) {\r
1854         loopCount = 10000;\r
1855     }\r
1856     \r
1857     logln("Word Break Monkey Test");\r
1858     RBBIWordMonkey  m = new RBBIWordMonkey();\r
1859     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);\r
1860     RunMonkey(bi, m, "word", seed, loopCount);\r
1861 }\r
1862 \r
1863 public void TestLineMonkey() {\r
1864     \r
1865     int        loopCount = 500;\r
1866     int        seed      = 1;\r
1867     \r
1868     if (params.inclusion >= 9) {\r
1869         loopCount = 10000;\r
1870     }\r
1871     \r
1872     logln("Line Break Monkey Test");\r
1873     RBBILineMonkey  m = new RBBILineMonkey();\r
1874     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);\r
1875     if (params == null) {\r
1876         loopCount = 50;\r
1877     }\r
1878     RunMonkey(bi, m, "line", seed, loopCount);\r
1879 }\r
1880 \r
1881 public void TestSentMonkey() {\r
1882     \r
1883     int        loopCount = 500;\r
1884     int        seed      = 1;\r
1885     \r
1886     if (params.inclusion >= 9) {\r
1887         loopCount = 3000;\r
1888     }\r
1889     \r
1890     logln("Sentence Break Monkey Test");\r
1891     RBBISentenceMonkey  m = new RBBISentenceMonkey();\r
1892     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);\r
1893     if (params == null) {\r
1894         loopCount = 30;\r
1895     }\r
1896     RunMonkey(bi, m, "sent", seed, loopCount);\r
1897 }\r
1898 //\r
1899 //  Round-trip monkey tests.\r
1900 //  Verify that break iterators created from the rule source from the default\r
1901 //    break iterators still pass the monkey test for the iterator type.\r
1902 //\r
1903 //  This is a major test for the Rule Compiler.  The default break iterators are built\r
1904 //  from pre-compiled binary rule data that was created using ICU4C; these\r
1905 //  round-trip rule recompile tests verify that the Java rule compiler can\r
1906 //  rebuild break iterators from the original source rules.\r
1907 //\r
1908 public void TestRTCharMonkey() {\r
1909     \r
1910     int        loopCount = 200;\r
1911     int        seed      = 1;\r
1912     \r
1913     if (params.inclusion >= 9) {\r
1914         loopCount = 2000;\r
1915     }\r
1916     \r
1917     RBBICharMonkey  m = new RBBICharMonkey();\r
1918     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);\r
1919     String rules = bi.toString();\r
1920     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1921     RunMonkey(rtbi, m, "char", seed, loopCount);\r
1922 }\r
1923 \r
1924 public void TestRTWordMonkey() {\r
1925     \r
1926     int        loopCount = 200;\r
1927     int        seed      = 1;\r
1928     \r
1929     if (params.inclusion >= 9) {\r
1930         loopCount = 2000;\r
1931     }\r
1932     \r
1933     logln("Word Break Monkey Test");\r
1934     RBBIWordMonkey  m = new RBBIWordMonkey();\r
1935     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);\r
1936     String rules = bi.toString();\r
1937     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1938     RunMonkey(rtbi, m, "word", seed, loopCount);\r
1939 }\r
1940 \r
1941 public void TestRTLineMonkey() {\r
1942     \r
1943     int        loopCount = 200;\r
1944     int        seed      = 1;\r
1945     \r
1946     if (params.inclusion >= 9) {\r
1947         loopCount = 2000;\r
1948     }\r
1949     \r
1950     logln("Line Break Monkey Test");\r
1951     RBBILineMonkey  m = new RBBILineMonkey();\r
1952     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);\r
1953     String rules = bi.toString();\r
1954     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1955     if (params == null) {\r
1956         loopCount = 50;\r
1957     }\r
1958     RunMonkey(rtbi, m, "line", seed, loopCount);\r
1959 }\r
1960 \r
1961 public void TestRTSentMonkey() {\r
1962     \r
1963     int        loopCount = 200;\r
1964     int        seed      = 1;\r
1965     \r
1966     if (params.inclusion >= 9) {\r
1967         loopCount = 1000;\r
1968     }\r
1969     \r
1970     logln("Sentence Break Monkey Test");\r
1971     RBBISentenceMonkey  m = new RBBISentenceMonkey();\r
1972     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);\r
1973     String rules = bi.toString();\r
1974     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1975     if (params == null) {\r
1976         loopCount = 30;\r
1977     }\r
1978     RunMonkey(rtbi, m, "sent", seed, loopCount);\r
1979 }\r
1980 \r
1981 \r
1982 \r
1983 }\r
1984 \r