jars/icu4j-4_4_2-src/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2003-2010 International Business Machines Corporation and     *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7  package com.ibm.icu.dev.test.rbbi;\r
   8 \r
   9 \r
  10 // Monkey testing of RuleBasedBreakIterator\r
  11 import java.util.ArrayList;\r
  12 import java.util.Arrays;\r
  13 import java.util.List;\r
  14 import java.util.Locale;\r
  15 \r
  16 import com.ibm.icu.dev.test.TestFmwk;\r
  17 import com.ibm.icu.lang.UCharacter;\r
  18 import com.ibm.icu.lang.UProperty;\r
  19 import com.ibm.icu.text.BreakIterator;\r
  20 import com.ibm.icu.text.RuleBasedBreakIterator;\r
  21 import com.ibm.icu.text.UTF16;\r
  22 import com.ibm.icu.text.UnicodeSet;\r
  23 \r
  24 \r
  25 /**\r
  26  * Monkey tests for RBBI.  These tests have independent implementations of\r
  27  * the Unicode TR boundary rules, and compare results between these and ICU's\r
  28  * implementation, using random data.\r
  29  * \r
  30  * Tests cover Grapheme Cluster (char), Word and Line breaks\r
  31  * \r
  32  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp\r
  33  *\r
  34  */\r
  35 public class RBBITestMonkey extends TestFmwk {\r
  36     \r
  37     public static void main(String[] args) {\r
  38         new RBBITestMonkey().run(args);\r
  39     }\r
  40     \r
  41 //\r
  42 //     classs RBBIMonkeyKind\r
  43 //\r
  44 //        Monkey Test for Break Iteration\r
  45 //        Abstract interface class.   Concrete derived classes independently\r
  46 //        implement the break rules for different iterator types.\r
  47 //\r
  48 //        The Monkey Test itself uses doesn't know which type of break iterator it is\r
  49 //        testing, but works purely in terms of the interface defined here.\r
  50 //\r
  51     abstract static class RBBIMonkeyKind {\r
  52     \r
  53         // Return a List of UnicodeSets, representing the character classes used\r
  54         //   for this type of iterator.\r
  55         abstract  List  charClasses();\r
  56 \r
  57         // Set the test text on which subsequent calls to next() will operate\r
  58         abstract  void   setText(StringBuffer text);\r
  59 \r
  60         // Find the next break postion, starting from the specified position.\r
  61         // Return -1 after reaching end of string.\r
  62         abstract   int   next(int i);\r
  63         \r
  64         // A Character Property, one of the constants defined in class UProperty.\r
  65         //   The value fo this property will be displayed for the characters\r
  66         //    near any test failure.  \r
  67         int   fCharProperty;\r
  68     }\r
  69 \r
  70  \r
  71     /**\r
  72      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.\r
  73      */\r
  74     static class RBBICharMonkey extends RBBIMonkeyKind {\r
  75         List                      fSets;\r
  76 \r
  77         UnicodeSet                fCRLFSet;\r
  78         UnicodeSet                fControlSet;\r
  79         UnicodeSet                fExtendSet;\r
  80         UnicodeSet                fPrependSet;\r
  81         UnicodeSet                fSpacingSet;\r
  82         UnicodeSet                fLSet;\r
  83         UnicodeSet                fVSet;\r
  84         UnicodeSet                fTSet;\r
  85         UnicodeSet                fLVSet;\r
  86         UnicodeSet                fLVTSet;\r
  87         UnicodeSet                fHangulSet;\r
  88         UnicodeSet                fAnySet;\r
  89 \r
  90         StringBuffer              fText;\r
  91 \r
  92 \r
  93     RBBICharMonkey() {\r
  94         fText       = null;\r
  95         fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;\r
  96         fCRLFSet    = new UnicodeSet("[\\r\\n]");\r
  97         fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");\r
  98         fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");\r
  99         fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");\r
 100         fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");\r
 101         fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");\r
 102         fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");\r
 103         fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");\r
 104         fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");\r
 105         fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");\r
 106         fHangulSet  = new UnicodeSet();\r
 107         fHangulSet.addAll(fLSet);\r
 108         fHangulSet.addAll(fVSet);\r
 109         fHangulSet.addAll(fTSet);\r
 110         fHangulSet.addAll(fLVSet);\r
 111         fHangulSet.addAll(fLVTSet);\r
 112 \r
 113         fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]");\r
 114 \r
 115         fSets       = new ArrayList();\r
 116         fSets.add(fCRLFSet);\r
 117         fSets.add(fControlSet);\r
 118         fSets.add(fExtendSet);\r
 119         fSets.add(fPrependSet);\r
 120         fSets.add(fSpacingSet);\r
 121         fSets.add(fHangulSet);\r
 122         fSets.add(fAnySet);\r
 123      }\r
 124 \r
 125 \r
 126     void setText(StringBuffer s) {\r
 127         fText = s;\r
 128     }\r
 129     \r
 130     List charClasses() {\r
 131         return fSets;\r
 132     }\r
 133     \r
 134     int next(int prevPos) {\r
 135         int    p1, p2, p3;    // Indices of the significant code points around the\r
 136                               //   break position being tested.  The candidate break\r
 137                               //   location is before p2.\r
 138     \r
 139         int     breakPos = -1;\r
 140     \r
 141         int   c1, c2, c3;     // The code points at p0, p1, p2 & p3.\r
 142         \r
 143         // Previous break at end of string.  return DONE.\r
 144         if (prevPos >= fText.length()) {\r
 145             return -1;\r
 146         }\r
 147         p1 = p2 = p3 = prevPos;\r
 148         c3 =  UTF16.charAt(fText, prevPos);\r
 149         c1 = c2 = 0;\r
 150     \r
 151         // Loop runs once per "significant" character position in the input text.\r
 152         for (;;) {\r
 153             // Move all of the positions forward in the input string.\r
 154             p1 = p2;  c1 = c2;\r
 155             p2 = p3;  c2 = c3;\r
 156     \r
 157             // Advance p3 by one codepoint\r
 158             p3 = moveIndex32(fText, p3, 1);\r
 159             c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);\r
 160     \r
 161             if (p1 == p2) {\r
 162                 // Still warming up the loop.  (won't work with zero length strings, but we don't care)\r
 163                 continue;\r
 164             }\r
 165             if (p2 == fText.length()) {\r
 166                 // Reached end of string.  Always a break position.\r
 167                 break;\r
 168             }\r
 169     \r
 170             // Rule  GB3   CR x LF\r
 171             //     No Extend or Format characters may appear between the CR and LF,\r
 172             //     which requires the additional check for p2 immediately following p1.\r
 173             //\r
 174             if (c1==0x0D && c2==0x0A && p1==(p2-1)) {\r
 175                 continue;\r
 176             }\r
 177     \r
 178             // Rule (GB4).   ( Control | CR | LF ) <break>\r
 179             if (fControlSet.contains(c1) ||\r
 180                 c1 == 0x0D ||\r
 181                 c1 == 0x0A)  {\r
 182                 break;\r
 183             }\r
 184     \r
 185             // Rule (GB5)    <break>  ( Control | CR | LF )\r
 186             //\r
 187             if (fControlSet.contains(c2) ||\r
 188                 c2 == 0x0D ||\r
 189                 c2 == 0x0A)  {\r
 190                 break;\r
 191             }\r
 192     \r
 193     \r
 194             // Rule (GB6)  L x ( L | V | LV | LVT )\r
 195             if (fLSet.contains(c1) &&\r
 196                 (fLSet.contains(c2)  ||\r
 197                     fVSet.contains(c2)  ||\r
 198                     fLVSet.contains(c2) ||\r
 199                     fLVTSet.contains(c2))) {\r
 200                 continue;\r
 201             }\r
 202     \r
 203             // Rule (GB7)    ( LV | V )  x  ( V | T )\r
 204             if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&\r
 205                 (fVSet.contains(c2) || fTSet.contains(c2)))  {\r
 206                 continue;\r
 207             }\r
 208     \r
 209             // Rule (GB8)    ( LVT | T)  x T\r
 210             if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&\r
 211                 fTSet.contains(c2))  {\r
 212                 continue;\r
 213             }\r
 214     \r
 215             // Rule (GB9)    Numeric x ALetter\r
 216             if (fExtendSet.contains(c2))  {\r
 217                 continue;\r
 218             }\r
 219     \r
 220             // Rule (GB9a)   x  SpacingMark\r
 221             if (fSpacingSet.contains(c2)) {\r
 222                 continue;\r
 223             }\r
 224     \r
 225             // Rule (GB9b)   Prepend x\r
 226             if (fPrependSet.contains(c1)) {\r
 227                 continue;\r
 228             }\r
 229     \r
 230             // Rule (GB10)  Any  <break>  Any\r
 231             break;\r
 232         }\r
 233     \r
 234         breakPos = p2;\r
 235         return breakPos;\r
 236         }\r
 237     }\r
 238 \r
 239 \r
 240     /**\r
 241      * \r
 242      * Word Monkey Test Class\r
 243      *\r
 244      * \r
 245      * \r
 246      */\r
 247     static class RBBIWordMonkey extends RBBIMonkeyKind {\r
 248         List                      fSets;\r
 249         StringBuffer              fText;\r
 250 \r
 251         UnicodeSet                fCRSet;\r
 252         UnicodeSet                fLFSet;\r
 253         UnicodeSet                fNewlineSet;\r
 254         UnicodeSet                fKatakanaSet;\r
 255         UnicodeSet                fALetterSet;\r
 256         UnicodeSet                fMidNumLetSet;\r
 257         UnicodeSet                fMidLetterSet;\r
 258         UnicodeSet                fMidNumSet;\r
 259         UnicodeSet                fNumericSet;\r
 260         UnicodeSet                fFormatSet;\r
 261         UnicodeSet                fExtendSet;\r
 262         UnicodeSet                fExtendNumLetSet;\r
 263         UnicodeSet                fOtherSet;\r
 264 \r
 265         \r
 266         RBBIWordMonkey() {\r
 267             fCharProperty    = UProperty.WORD_BREAK;\r
 268 \r
 269             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");\r
 270             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");\r
 271             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");\r
 272             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");\r
 273             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");\r
 274             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");\r
 275             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");\r
 276             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");\r
 277             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");\r
 278             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");\r
 279             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");\r
 280             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");\r
 281 \r
 282             fOtherSet        = new UnicodeSet();\r
 283             fOtherSet.complement();\r
 284             fOtherSet.removeAll(fCRSet);\r
 285             fOtherSet.removeAll(fLFSet);\r
 286             fOtherSet.removeAll(fNewlineSet);\r
 287             fOtherSet.removeAll(fALetterSet);\r
 288             fOtherSet.removeAll(fKatakanaSet);\r
 289             fOtherSet.removeAll(fMidLetterSet);\r
 290             fOtherSet.removeAll(fMidNumSet);\r
 291             fOtherSet.removeAll(fNumericSet);\r
 292             fOtherSet.removeAll(fFormatSet);\r
 293             fOtherSet.removeAll(fExtendSet);\r
 294             fOtherSet.removeAll(fExtendNumLetSet);\r
 295             // Inhibit dictionary characters from being tested at all.\r
 296             fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));\r
 297 \r
 298             fSets            = new ArrayList();\r
 299             fSets.add(fCRSet);\r
 300             fSets.add(fLFSet);\r
 301             fSets.add(fNewlineSet);\r
 302             fSets.add(fALetterSet);\r
 303             fSets.add(fKatakanaSet);\r
 304             fSets.add(fMidLetterSet);\r
 305             fSets.add(fMidNumLetSet);\r
 306             fSets.add(fMidNumSet);\r
 307             fSets.add(fNumericSet);\r
 308             fSets.add(fFormatSet);\r
 309             fSets.add(fExtendSet);\r
 310             fSets.add(fExtendNumLetSet);\r
 311             fSets.add(fOtherSet);\r
 312         }\r
 313         \r
 314         \r
 315         List  charClasses() {\r
 316          return fSets;  \r
 317         }\r
 318         \r
 319         void   setText(StringBuffer s) { \r
 320             fText = s;        \r
 321         }   \r
 322 \r
 323         int   next(int prevPos) {  \r
 324             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the \r
 325                                         //   break position being tested.  The candidate break\r
 326                                         //   location is before p2.\r
 327             int     breakPos = -1;\r
 328             \r
 329             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.\r
 330             \r
 331             // Previous break at end of string.  return DONE.\r
 332             if (prevPos >= fText.length()) {\r
 333                 return -1;\r
 334             }\r
 335             /*p0 =*/ p1 = p2 = p3 = prevPos;\r
 336             c3 = UTF16.charAt(fText, prevPos);\r
 337             c0 = c1 = c2 = 0;\r
 338             \r
 339             \r
 340 \r
 341             // Loop runs once per "significant" character position in the input text.\r
 342             for (;;) {\r
 343                 // Move all of the positions forward in the input string.\r
 344                 /*p0 = p1;*/  c0 = c1;\r
 345                 p1 = p2;  c1 = c2;\r
 346                 p2 = p3;  c2 = c3;\r
 347                 \r
 348                 // Advancd p3 by    X(Extend | Format)*   Rule 4\r
 349                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)\r
 350                 do {\r
 351                     p3 = moveIndex32(fText, p3, 1);\r
 352                     c3 = -1;\r
 353                     if (p3>=fText.length()) {\r
 354                         break;\r
 355                     }\r
 356                     c3 = UTF16.charAt(fText, p3);\r
 357                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {\r
 358                         break;\r
 359                     }\r
 360                 }\r
 361                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));\r
 362 \r
 363                 if (p1 == p2) {\r
 364                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)\r
 365                     continue;\r
 366                 }\r
 367                 if (p2 == fText.length()) {\r
 368                     // Reached end of string.  Always a break position.\r
 369                     break;\r
 370                 }\r
 371 \r
 372                 // Rule (3)   CR x LF\r
 373                 //     No Extend or Format characters may appear between the CR and LF,\r
 374                 //     which requires the additional check for p2 immediately following p1.\r
 375                 //\r
 376                 if (c1==0x0D && c2==0x0A) {\r
 377                     continue;\r
 378                 }\r
 379                 \r
 380                 // Rule (3a)  Break before and after newlines (including CR and LF)\r
 381                 //\r
 382                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {\r
 383                     break;\r
 384                 }\r
 385                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {\r
 386                     break;\r
 387                 }\r
 388 \r
 389                 // Rule (5).   ALetter x ALetter\r
 390                 if (fALetterSet.contains(c1) &&\r
 391                         fALetterSet.contains(c2))  {\r
 392                     continue;\r
 393                 }\r
 394                 \r
 395                 // Rule (6)  ALetter  x  (MidLetter | MidNumLet)  ALetter\r
 396                 //\r
 397                 if ( fALetterSet.contains(c1) &&\r
 398                         (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&\r
 399                         setContains(fALetterSet, c3)) {\r
 400                     continue;\r
 401                 }\r
 402                 \r
 403                 \r
 404                 // Rule (7)  ALetter (MidLetter | MidNumLet)   x  ALetter\r
 405                 if (fALetterSet.contains(c0) &&\r
 406                         (fMidLetterSet.contains(c1) ||  fMidNumLetSet.contains(c1))  &&\r
 407                         fALetterSet.contains(c2)) {\r
 408                     continue;\r
 409                 }\r
 410                 \r
 411                 //  Rule (8)    Numeric x Numeric\r
 412                 if (fNumericSet.contains(c1) &&\r
 413                         fNumericSet.contains(c2))  {\r
 414                     continue;\r
 415                 }\r
 416                 \r
 417                 // Rule (9)    ALetter x Numeric\r
 418                 if (fALetterSet.contains(c1) &&\r
 419                         fNumericSet.contains(c2))  {\r
 420                     continue;\r
 421                 }\r
 422 \r
 423                 // Rule (10)    Numeric x ALetter\r
 424                 if (fNumericSet.contains(c1) &&\r
 425                         fALetterSet.contains(c2))  {\r
 426                     continue;\r
 427                 }\r
 428                 \r
 429                 // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric\r
 430                 if ( fNumericSet.contains(c0) &&\r
 431                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1))  && \r
 432                         fNumericSet.contains(c2)) {\r
 433                     continue;\r
 434                 }\r
 435                 \r
 436                 // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric\r
 437                 if (fNumericSet.contains(c1) &&\r
 438                         (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&\r
 439                         setContains(fNumericSet, c3)) {\r
 440                     continue;\r
 441                 }\r
 442                 \r
 443                 // Rule (13)  Katakana x Katakana\r
 444                 if (fKatakanaSet.contains(c1) &&\r
 445                         fKatakanaSet.contains(c2))  {\r
 446                     continue;\r
 447                 }\r
 448                 \r
 449                 // Rule 13a  (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet\r
 450                 if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||\r
 451                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&\r
 452                         fExtendNumLetSet.contains(c2)) {\r
 453                     continue;\r
 454                 }\r
 455                 // Rule 13b   ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)\r
 456                 if (fExtendNumLetSet.contains(c1) &&\r
 457                         (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||\r
 458                         fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {\r
 459                     continue;\r
 460                 }\r
 461                \r
 462                 // Rule 14.  Break found here.\r
 463                 break;\r
 464             }\r
 465             \r
 466             breakPos = p2;\r
 467             return breakPos;\r
 468         }\r
 469         \r
 470     }\r
 471 \r
 472  \r
 473     static class RBBILineMonkey extends RBBIMonkeyKind {\r
 474         \r
 475         List        fSets;\r
 476         \r
 477         UnicodeSet  fBK;\r
 478         UnicodeSet  fCR;\r
 479         UnicodeSet  fLF;\r
 480         UnicodeSet  fCM;\r
 481         UnicodeSet  fNL;\r
 482         UnicodeSet  fSG;\r
 483         UnicodeSet  fWJ;\r
 484         UnicodeSet  fZW;\r
 485         UnicodeSet  fGL;\r
 486         UnicodeSet  fCB;\r
 487         UnicodeSet  fSP;\r
 488         UnicodeSet  fB2;\r
 489         UnicodeSet  fBA;\r
 490         UnicodeSet  fBB;\r
 491         UnicodeSet  fHY;\r
 492         UnicodeSet  fCL;\r
 493         UnicodeSet  fCP;\r
 494         UnicodeSet  fEX;\r
 495         UnicodeSet  fIN;\r
 496         UnicodeSet  fNS;\r
 497         UnicodeSet  fOP;\r
 498         UnicodeSet  fQU;\r
 499         UnicodeSet  fIS;\r
 500         UnicodeSet  fNU;\r
 501         UnicodeSet  fPO;\r
 502         UnicodeSet  fPR;\r
 503         UnicodeSet  fSY;\r
 504         UnicodeSet  fAI;\r
 505         UnicodeSet  fAL;\r
 506         UnicodeSet  fID;\r
 507         UnicodeSet  fSA;\r
 508         UnicodeSet  fJL;\r
 509         UnicodeSet  fJV;\r
 510         UnicodeSet  fJT;\r
 511         UnicodeSet  fH2;\r
 512         UnicodeSet  fH3;\r
 513         UnicodeSet  fXX;\r
 514         \r
 515         StringBuffer  fText;\r
 516         int           fOrigPositions;\r
 517         \r
 518         \r
 519         \r
 520         RBBILineMonkey()\r
 521         {\r
 522             fCharProperty  = UProperty.LINE_BREAK;\r
 523             fSets          = new ArrayList();\r
 524             \r
 525             fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");\r
 526             fCR    = new UnicodeSet("[\\p{Line_break=CR}]");\r
 527             fLF    = new UnicodeSet("[\\p{Line_break=LF}]");\r
 528             fCM    = new UnicodeSet("[\\p{Line_break=CM}]");\r
 529             fNL    = new UnicodeSet("[\\p{Line_break=NL}]");\r
 530             fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");\r
 531             fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");\r
 532             fGL    = new UnicodeSet("[\\p{Line_break=GL}]");\r
 533             fCB    = new UnicodeSet("[\\p{Line_break=CB}]");\r
 534             fSP    = new UnicodeSet("[\\p{Line_break=SP}]");\r
 535             fB2    = new UnicodeSet("[\\p{Line_break=B2}]");\r
 536             fBA    = new UnicodeSet("[\\p{Line_break=BA}]");\r
 537             fBB    = new UnicodeSet("[\\p{Line_break=BB}]");\r
 538             fHY    = new UnicodeSet("[\\p{Line_break=HY}]");\r
 539             fCL    = new UnicodeSet("[\\p{Line_break=CL}]");\r
 540             fCP    = new UnicodeSet("[\\p{Line_break=CP}]");\r
 541             fEX    = new UnicodeSet("[\\p{Line_break=EX}]");\r
 542             fIN    = new UnicodeSet("[\\p{Line_break=IN}]");\r
 543             fNS    = new UnicodeSet("[\\p{Line_break=NS}]");\r
 544             fOP    = new UnicodeSet("[\\p{Line_break=OP}]");\r
 545             fQU    = new UnicodeSet("[\\p{Line_break=QU}]");\r
 546             fIS    = new UnicodeSet("[\\p{Line_break=IS}]");\r
 547             fNU    = new UnicodeSet("[\\p{Line_break=NU}]");\r
 548             fPO    = new UnicodeSet("[\\p{Line_break=PO}]");\r
 549             fPR    = new UnicodeSet("[\\p{Line_break=PR}]");\r
 550             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");\r
 551             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");\r
 552             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");\r
 553             fID    = new UnicodeSet("[\\p{Line_break=ID}]");\r
 554             fSA    = new UnicodeSet("[\\p{Line_break=SA}]");\r
 555             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");\r
 556             fJV    = new UnicodeSet("[\\p{Line_break=JV}]");\r
 557             fJT    = new UnicodeSet("[\\p{Line_break=JT}]");\r
 558             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");\r
 559             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");\r
 560             fSG    = new UnicodeSet("[\\ud800-\\udfff]");\r
 561             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");\r
 562 \r
 563             \r
 564             fAL.addAll(fXX);     // Default behavior for XX is identical to AL\r
 565             fAL.addAll(fAI);     // Default behavior for AI is identical to AL\r
 566             fAL.addAll(fSA);     // Default behavior for SA is XX, which defaults to AL\r
 567             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL\r
 568             \r
 569             \r
 570             \r
 571             fSets.add(fBK);\r
 572             fSets.add(fCR);\r
 573             fSets.add(fLF);\r
 574             fSets.add(fCM);\r
 575             fSets.add(fNL);\r
 576             fSets.add(fWJ);\r
 577             fSets.add(fZW);\r
 578             fSets.add(fGL);\r
 579             fSets.add(fCB);\r
 580             fSets.add(fSP);\r
 581             fSets.add(fB2);\r
 582             fSets.add(fBA);\r
 583             fSets.add(fBB);\r
 584             fSets.add(fHY);\r
 585             fSets.add(fH2);\r
 586             fSets.add(fH3);\r
 587             fSets.add(fCL);\r
 588             fSets.add(fCP);\r
 589             fSets.add(fEX);\r
 590             fSets.add(fIN);\r
 591             fSets.add(fJL);\r
 592             fSets.add(fJT);\r
 593             fSets.add(fJV);\r
 594             fSets.add(fNS);\r
 595             fSets.add(fOP);\r
 596             fSets.add(fQU);\r
 597             fSets.add(fIS);\r
 598             fSets.add(fNU);\r
 599             fSets.add(fPO);\r
 600             fSets.add(fPR);\r
 601             fSets.add(fSY);\r
 602             fSets.add(fAI);\r
 603             fSets.add(fAL);\r
 604             fSets.add(fID);\r
 605             fSets.add(fWJ);\r
 606             fSets.add(fSA);\r
 607             fSets.add(fSG);\r
 608             \r
 609         }\r
 610         \r
 611         void setText(StringBuffer s) {\r
 612             fText       = s;\r
 613         }\r
 614         \r
 615         \r
 616         \r
 617 \r
 618         int next(int startPos) {\r
 619             int    pos;       //  Index of the char following a potential break position\r
 620             int    thisChar;  //  Character at above position "pos"\r
 621             \r
 622             int    prevPos;   //  Index of the char preceding a potential break position\r
 623             int    prevChar;  //  Character at above position.  Note that prevChar\r
 624                               //   and thisChar may not be adjacent because combining\r
 625                               //   characters between them will be ignored.\r
 626             \r
 627             int    nextPos;   //  Index of the next character following pos.\r
 628                               //     Usually skips over combining marks.\r
 629             int    tPos;      //  temp value.\r
 630             int    matchVals[]  = null;       // Number  Expression Match Results\r
 631  \r
 632             \r
 633             if (startPos >= fText.length()) {\r
 634                 return -1;\r
 635             }\r
 636             \r
 637             \r
 638             // Initial values for loop.  Loop will run the first time without finding breaks,\r
 639             //                           while the invalid values shift out and the "this" and\r
 640             //                           "prev" positions are filled in with good values.\r
 641             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.\r
 642             thisChar = prevChar  = 0;\r
 643             nextPos  = startPos;\r
 644             \r
 645             \r
 646             // Loop runs once per position in the test text, until a break position\r
 647             //  is found.  In each iteration, we are testing for a possible break\r
 648             //  just preceding the character at index "pos".  The character preceding\r
 649             //  this char is at postion "prevPos"; because of combining sequences,\r
 650             //  "prevPos" can be arbitrarily far before "pos".\r
 651             for (;;) {\r
 652                 // Advance to the next position to be tested.\r
 653                 prevPos   = pos;\r
 654                 prevChar  = thisChar;\r
 655                 pos       = nextPos;\r
 656                 nextPos   = moveIndex32(fText, pos, 1);\r
 657                 \r
 658                 // Rule LB2 - Break at end of text.\r
 659                 if (pos >= fText.length()) {\r
 660                     break;\r
 661                 }\r
 662                 \r
 663                 // Rule LB 9 - adjust for combining sequences.\r
 664                 //             We do this rule out-of-order because the adjustment does\r
 665                 //             not effect the way that rules LB 3 through LB 6 match,\r
 666                 //             and doing it here rather than after LB 6 is substantially\r
 667                 //             simpler when combining sequences do occur.\r
 668                 \r
 669                 \r
 670                 // LB 9         Keep combining sequences together.\r
 671                 //              advance over any CM class chars at "pos", \r
 672                 //              result is "nextPos" for the following loop iteration.\r
 673                 thisChar  = UTF16.charAt(fText, pos);\r
 674                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||\r
 675                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {\r
 676                     for (;;) {\r
 677                         if (nextPos == fText.length()) {\r
 678                             break;   \r
 679                         }\r
 680                         int nextChar = UTF16.charAt(fText, nextPos);\r
 681                         if (!fCM.contains(nextChar)) {\r
 682                             break;\r
 683                         }\r
 684                         nextPos = moveIndex32(fText, nextPos, 1);\r
 685                     }\r
 686                 }\r
 687                 \r
 688                 // LB 9 Treat X CM* as if it were X\r
 689                 //        No explicit action required.\r
 690                 \r
 691                 // LB 10     Treat any remaining combining mark as AL\r
 692                 if (fCM.contains(thisChar)) {\r
 693                     thisChar = 'A';   \r
 694                 }\r
 695 \r
 696                 \r
 697                 // If the loop is still warming up - if we haven't shifted the initial\r
 698                 //   -1 positions out of prevPos yet - loop back to advance the\r
 699                 //    position in the input without any further looking for breaks.\r
 700                 if (prevPos == -1) {\r
 701                     continue;\r
 702                 }\r
 703                 \r
 704                 // LB 4  Always break after hard line breaks,\r
 705                 if (fBK.contains(prevChar)) {\r
 706                     break;\r
 707                 }\r
 708                 \r
 709                 // LB 5  Break after CR, LF, NL, but not inside CR LF\r
 710                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {\r
 711                     continue;\r
 712                 }\r
 713                 if  (fCR.contains(prevChar) ||\r
 714                      fLF.contains(prevChar) ||\r
 715                      fNL.contains(prevChar))  {\r
 716                     break;\r
 717                 }\r
 718                 \r
 719                 // LB 6  Don't break before hard line breaks\r
 720                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||\r
 721                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {\r
 722                     continue;\r
 723                 }\r
 724                 \r
 725                 \r
 726                 // LB 7  Don't break before spaces or zero-width space.\r
 727                 if (fSP.contains(thisChar)) {\r
 728                     continue;\r
 729                 }\r
 730                 \r
 731                 if (fZW.contains(thisChar)) {\r
 732                     continue;\r
 733                 }\r
 734                 \r
 735                 // LB 8  Break after zero width space\r
 736                 if (fZW.contains(prevChar)) {\r
 737                     break;\r
 738                 }\r
 739                 \r
 740                 //  LB 9, 10  Already done, at top of loop.\r
 741                 //\r
 742                 \r
 743                 \r
 744                 // LB 11\r
 745                 //    x  WJ\r
 746                 //    WJ  x\r
 747                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {\r
 748                     continue;\r
 749                 }\r
 750                 \r
 751                 \r
 752                 // LB 12\r
 753                 //        GL x\r
 754                 if (fGL.contains(prevChar)) {\r
 755                     continue;\r
 756                 }\r
 757                 \r
 758                 // LB 12a\r
 759                 //    [^SP BA HY] x GL\r
 760                 if (!(fSP.contains(prevChar) ||\r
 761                       fBA.contains(prevChar) ||\r
 762                       fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {\r
 763                     continue;\r
 764                 }\r
 765 \r
 766                 \r
 767                 \r
 768                 // LB 13  Don't break before closings.\r
 769                 //       NU x CL, NU x CP  and NU x IS are not matched here so that they will\r
 770                 //       fall into LB 17 and the more general number regular expression.\r
 771                 //\r
 772                 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||\r
 773                     !fNU.contains(prevChar) && fCP.contains(thisChar) ||\r
 774                                                fEX.contains(thisChar) ||\r
 775                     !fNU.contains(prevChar) && fIS.contains(thisChar) ||\r
 776                     !fNU.contains(prevChar) && fSY.contains(thisChar))    {\r
 777                     continue;\r
 778                 }\r
 779                 \r
 780                 // LB 14  Don't break after OP SP*\r
 781                 //       Scan backwards, checking for this sequence.\r
 782                 //       The OP char could include combining marks, so we actually check for\r
 783                 //           OP CM* SP* x\r
 784                 tPos = prevPos;\r
 785                 if (fSP.contains(prevChar)) {\r
 786                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 787                         tPos=moveIndex32(fText, tPos, -1);\r
 788                     }\r
 789                 }\r
 790                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 791                     tPos=moveIndex32(fText, tPos, -1);\r
 792                 }\r
 793                 if (fOP.contains(UTF16.charAt(fText, tPos))) {\r
 794                     continue;\r
 795                 }\r
 796                 \r
 797                 // LB 15 Do not break within "[ \r
 798                 //       QU CM* SP* x OP\r
 799                 if (fOP.contains(thisChar)) {\r
 800                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*\r
 801                     tPos = prevPos;\r
 802                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 803                         tPos = moveIndex32(fText, tPos, -1);\r
 804                     }\r
 805                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 806                         tPos = moveIndex32(fText, tPos, -1);\r
 807                     }\r
 808                     if (fQU.contains(UTF16.charAt(fText, tPos))) {\r
 809                         continue;\r
 810                     }\r
 811                 }               \r
 812                 \r
 813                 // LB 16   (CL | CP) SP* x NS\r
 814                 if (fNS.contains(thisChar)) {\r
 815                     tPos = prevPos;\r
 816                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 817                         tPos = moveIndex32(fText, tPos, -1);\r
 818                     }\r
 819                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 820                         tPos = moveIndex32(fText, tPos, -1);\r
 821                     }\r
 822                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {\r
 823                         continue;\r
 824                     }\r
 825                 }               \r
 826                 \r
 827                                \r
 828                 // LB 17        B2 SP* x B2\r
 829                 if (fB2.contains(thisChar)) {\r
 830                     tPos = prevPos;\r
 831                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {\r
 832                         tPos = moveIndex32(fText, tPos, -1);\r
 833                     }\r
 834                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {\r
 835                         tPos = moveIndex32(fText, tPos, -1);\r
 836                     }\r
 837                     if (fB2.contains(UTF16.charAt(fText, tPos))) {\r
 838                         continue;\r
 839                     }\r
 840                 }               \r
 841                 \r
 842                 // LB 18    break after space\r
 843                 if (fSP.contains(prevChar)) {\r
 844                     break;\r
 845                 }\r
 846                 \r
 847                 // LB 19\r
 848                 //    x   QU\r
 849                 //    QU  x\r
 850                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {\r
 851                     continue;\r
 852                 }\r
 853                 \r
 854                 // LB 20  Break around a CB\r
 855                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {\r
 856                     break;\r
 857                 }\r
 858                 \r
 859                 // LB 21\r
 860                 if (fBA.contains(thisChar) ||\r
 861                         fHY.contains(thisChar) ||\r
 862                         fNS.contains(thisChar) ||\r
 863                         fBB.contains(prevChar) )   {\r
 864                     continue;\r
 865                 }\r
 866                 \r
 867                 // LB 22\r
 868                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||\r
 869                         fID.contains(prevChar) && fIN.contains(thisChar) ||\r
 870                         fIN.contains(prevChar) && fIN.contains(thisChar) ||\r
 871                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {\r
 872                     continue;\r
 873                 }\r
 874                 \r
 875                 \r
 876                 // LB 23    ID x PO    (Note:  Leading CM behaves like ID)\r
 877                 //          AL x NU\r
 878                 //          NU x AL\r
 879                 if (fID.contains(prevChar) && fPO.contains(thisChar) ||\r
 880                         fAL.contains(prevChar) && fNU.contains(thisChar) ||\r
 881                         fNU.contains(prevChar) && fAL.contains(thisChar) )   {\r
 882                     continue;\r
 883                 }\r
 884                 \r
 885                 // LB 24  Do not break between prefix and letters or ideographs.\r
 886                 //        PR x ID\r
 887                 //        PR x AL\r
 888                 //        PO x AL\r
 889                 if (fPR.contains(prevChar) && fID.contains(thisChar) ||\r
 890                     fPR.contains(prevChar) && fAL.contains(thisChar) ||\r
 891                     fPO.contains(prevChar) && fAL.contains(thisChar))  {\r
 892                     continue;\r
 893                 }\r
 894                 \r
 895                 \r
 896                 // LB 25    Numbers\r
 897                 matchVals = LBNumberCheck(fText, prevPos, matchVals);\r
 898                 if (matchVals[0] != -1) {\r
 899                     // Matched a number.  But could have been just a single digit, which would\r
 900                     //    not represent a "no break here" between prevChar and thisChar\r
 901                     int numEndIdx = matchVals[1];  // idx of first char following num\r
 902                     if (numEndIdx > pos) {\r
 903                         // Number match includes at least the two chars being checked\r
 904                         if (numEndIdx > nextPos) {\r
 905                             // Number match includes additional chars.  Update pos and nextPos\r
 906                             //   so that next loop iteration will continue at the end of the number,\r
 907                             //   checking for breaks between last char in number & whatever follows.\r
 908                             nextPos = numEndIdx;\r
 909                             pos     = numEndIdx;\r
 910                             do {\r
 911                                 pos = moveIndex32(fText, pos, -1);  \r
 912                                 thisChar = UTF16.charAt(fText, pos);\r
 913                             }\r
 914                             while (fCM.contains(thisChar));\r
 915                         }\r
 916                         continue;\r
 917                     }\r
 918                 }\r
 919                 \r
 920                 \r
 921                 // LB 26  Do not break Korean Syllables\r
 922                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||\r
 923                                                 fJV.contains(thisChar) ||\r
 924                                                 fH2.contains(thisChar) ||\r
 925                                                 fH3.contains(thisChar))) {\r
 926                                                     continue;\r
 927                                                 }\r
 928 \r
 929                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&\r
 930                     (fJV.contains(thisChar) || fJT.contains(thisChar))) {\r
 931                         continue;\r
 932                 }\r
 933 \r
 934                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&\r
 935                     fJT.contains(thisChar)) {\r
 936                         continue;\r
 937                 }\r
 938 \r
 939                 // LB 27 Treat a Korean Syllable Block the same as ID\r
 940                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||\r
 941                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&\r
 942                     fIN.contains(thisChar)) {\r
 943                         continue;\r
 944                     }\r
 945                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||\r
 946                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&\r
 947                     fPO.contains(thisChar)) {\r
 948                         continue;\r
 949                     }\r
 950                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||\r
 951                     fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {\r
 952                         continue;\r
 953                     }\r
 954 \r
 955                 \r
 956                 \r
 957                 // LB 28 Do not break between alphabetics\r
 958                 if (fAL.contains(prevChar) && fAL.contains(thisChar)) {\r
 959                     continue;\r
 960                 }\r
 961                 \r
 962                 // LB 29  Do not break between numeric punctuation and alphabetics\r
 963                 if (fIS.contains(prevChar) && fAL.contains(thisChar)) {\r
 964                     continue;\r
 965                 }\r
 966                 \r
 967                 // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.\r
 968                 //          (AL | NU) x OP\r
 969                 //          CP x (AL | NU)\r
 970                 if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {\r
 971                     continue;\r
 972                 }\r
 973                 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {\r
 974                     continue;\r
 975                 }\r
 976 \r
 977               \r
 978                 // LB 31    Break everywhere else\r
 979                 break;            \r
 980             }\r
 981             \r
 982             return pos;\r
 983         }\r
 984         \r
 985         \r
 986         \r
 987         // Match the following regular expression in the input text.\r
 988         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?\r
 989         //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)\r
 990         //  retVals array  [0]  index of the start of the match, or -1 if no match\r
 991         //                 [1]  index of first char following the match.\r
 992         //  Can not use Java regex because need supplementary character support,\r
 993         //     and because Unicode char properties version must be the same as in\r
 994         //     the version of ICU being tested.\r
 995         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {\r
 996             if (retVals == null) {\r
 997                 retVals = new int[2];\r
 998              }\r
 999             retVals[0]     = -1;  // Indicates no match.\r
1000             int matchState = 0;\r
1001             int idx        = startIdx;\r
1002             \r
1003             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){\r
1004                 int c = UTF16.charAt(s, idx);\r
1005                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);\r
1006                 switch (matchState) {\r
1007                     case 0:   \r
1008                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||\r
1009                             cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {\r
1010                             matchState = 1;  \r
1011                             break;\r
1012                         }\r
1013                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {\r
1014                             matchState = 4;\r
1015                             break;\r
1016                         }\r
1017                         if (cLBType == UCharacter.LineBreak.HYPHEN) {\r
1018                             matchState = 4;\r
1019                             break;\r
1020                         }\r
1021                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1022                             matchState = 7;\r
1023                             break;\r
1024                         }\r
1025                         break matchLoop;   /* No Match  */\r
1026                         \r
1027                     case 1:\r
1028                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1029                             matchState = 1;\r
1030                             break;\r
1031                         }\r
1032                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {\r
1033                             matchState = 4;\r
1034                             break;\r
1035                         }\r
1036                         if (cLBType == UCharacter.LineBreak.HYPHEN) {\r
1037                             matchState = 4;\r
1038                             break;\r
1039                         }\r
1040                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1041                             matchState = 7;\r
1042                             break;\r
1043                         }\r
1044                         break matchLoop;   /* No Match  */\r
1045                         \r
1046                         \r
1047                     case 4:\r
1048                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1049                             matchState = 4;\r
1050                             break;\r
1051                         }\r
1052                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1053                             matchState = 7;\r
1054                             break;\r
1055                         }\r
1056                         break matchLoop;   /* No Match  */\r
1057                         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?\r
1058                         //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)\r
1059                  \r
1060                     case 7:\r
1061                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1062                             matchState = 7;\r
1063                             break;                           \r
1064                         }\r
1065                         if (cLBType == UCharacter.LineBreak.NUMERIC) {\r
1066                             matchState = 7;\r
1067                             break;                           \r
1068                         }\r
1069                         if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {\r
1070                             matchState = 7;\r
1071                             break;                           \r
1072                         }\r
1073                         if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {\r
1074                             matchState = 7;\r
1075                             break;       \r
1076                         }\r
1077                         if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {\r
1078                             matchState = 9;\r
1079                             break;                           \r
1080                         }\r
1081                         if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {\r
1082                             matchState = 9;\r
1083                             break;                           \r
1084                         }\r
1085                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {\r
1086                             matchState = 11;\r
1087                             break;                           \r
1088                         }\r
1089                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {\r
1090                             matchState = 11;\r
1091                             break;                           \r
1092                         }\r
1093 \r
1094                         break matchLoop;    // Match Complete.\r
1095                     case 9:\r
1096                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1097                             matchState = 9;\r
1098                             break;                           \r
1099                         }\r
1100                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {\r
1101                             matchState = 11;\r
1102                             break;                           \r
1103                         }\r
1104                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {\r
1105                             matchState = 11;\r
1106                             break;                           \r
1107                         }\r
1108                         break matchLoop;    // Match Complete.\r
1109                     case 11:\r
1110                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {\r
1111                             matchState = 11;\r
1112                             break;                           \r
1113                         }\r
1114                         break matchLoop;    // Match Complete.\r
1115                 }\r
1116             }\r
1117             if (matchState > 4) {\r
1118                 retVals[0] = startIdx;   \r
1119                  retVals[1] = idx;   \r
1120             }\r
1121             return retVals;\r
1122         }\r
1123         \r
1124         \r
1125         List  charClasses() {\r
1126             return fSets;\r
1127         }\r
1128         \r
1129         \r
1130     \r
1131     }\r
1132 \r
1133      \r
1134     /**\r
1135      * \r
1136      * Sentence Monkey Test Class\r
1137      *\r
1138      * \r
1139      * \r
1140      */\r
1141     static class RBBISentenceMonkey extends RBBIMonkeyKind {\r
1142         List                 fSets;\r
1143         StringBuffer         fText;\r
1144 \r
1145         UnicodeSet           fSepSet;\r
1146         UnicodeSet           fFormatSet;\r
1147         UnicodeSet           fSpSet;\r
1148         UnicodeSet           fLowerSet;\r
1149         UnicodeSet           fUpperSet;\r
1150         UnicodeSet           fOLetterSet;\r
1151         UnicodeSet           fNumericSet;\r
1152         UnicodeSet           fATermSet;\r
1153         UnicodeSet           fSContinueSet;\r
1154         UnicodeSet           fSTermSet;\r
1155         UnicodeSet           fCloseSet;\r
1156         UnicodeSet           fOtherSet;\r
1157         UnicodeSet           fExtendSet;\r
1158 \r
1159  \r
1160         \r
1161         RBBISentenceMonkey() {\r
1162             fCharProperty  = UProperty.SENTENCE_BREAK;\r
1163 \r
1164             fSets            = new ArrayList();\r
1165 \r
1166             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator\r
1167             //                       set and made into character classes of their own.  For the monkey impl,\r
1168             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.\r
1169             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");\r
1170             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");\r
1171             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");\r
1172             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");\r
1173             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");\r
1174             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");\r
1175             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");\r
1176             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");\r
1177             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");\r
1178             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");\r
1179             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");\r
1180             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");\r
1181             fOtherSet        = new UnicodeSet();\r
1182 \r
1183 \r
1184             fOtherSet.complement();\r
1185             fOtherSet.removeAll(fSepSet);\r
1186             fOtherSet.removeAll(fFormatSet);\r
1187             fOtherSet.removeAll(fSpSet);\r
1188             fOtherSet.removeAll(fLowerSet);\r
1189             fOtherSet.removeAll(fUpperSet);\r
1190             fOtherSet.removeAll(fOLetterSet);\r
1191             fOtherSet.removeAll(fNumericSet);\r
1192             fOtherSet.removeAll(fATermSet);\r
1193             fOtherSet.removeAll(fSContinueSet);\r
1194             fOtherSet.removeAll(fSTermSet);\r
1195             fOtherSet.removeAll(fCloseSet);\r
1196             fOtherSet.removeAll(fExtendSet);\r
1197 \r
1198             fSets.add(fSepSet);\r
1199             fSets.add(fFormatSet);\r
1200 \r
1201             fSets.add(fSpSet);\r
1202             fSets.add(fLowerSet);\r
1203             fSets.add(fUpperSet);\r
1204             fSets.add(fOLetterSet);\r
1205             fSets.add(fNumericSet);\r
1206             fSets.add(fATermSet);\r
1207             fSets.add(fSContinueSet);\r
1208             fSets.add(fSTermSet);\r
1209             fSets.add(fCloseSet);\r
1210             fSets.add(fOtherSet);\r
1211             fSets.add(fExtendSet);\r
1212         }\r
1213         \r
1214         \r
1215         List  charClasses() {\r
1216             return fSets;  \r
1217         }\r
1218         \r
1219         void   setText(StringBuffer s) { \r
1220             fText = s;        \r
1221         }   \r
1222 \r
1223         \r
1224         //      moveBack()   Find the "significant" code point preceding the index i.\r
1225         //      Skips over ($Extend | $Format)*\r
1226         // \r
1227         private int moveBack(int i) {\r
1228             \r
1229             if (i <= 0) {\r
1230                 return -1;\r
1231             }\r
1232             \r
1233             int      c;\r
1234             int      j = i;\r
1235             do {\r
1236                 j = moveIndex32(fText, j, -1);\r
1237                 c = UTF16.charAt(fText, j);\r
1238             }\r
1239             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));\r
1240             return j;\r
1241         }\r
1242         \r
1243         \r
1244         int moveForward(int i) {\r
1245             if (i>=fText.length()) {\r
1246                 return fText.length();\r
1247             }\r
1248             int   c;\r
1249             int   j = i;\r
1250             do {\r
1251                 j = moveIndex32(fText, j, 1);\r
1252                 c = cAt(j);\r
1253             }\r
1254             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));\r
1255             return j;\r
1256            \r
1257         }\r
1258         \r
1259         int cAt(int pos) {\r
1260             if (pos<0 || pos>=fText.length()) {\r
1261                 return -1;\r
1262             }\r
1263             return UTF16.charAt(fText, pos);\r
1264         }\r
1265 \r
1266         int   next(int prevPos) {  \r
1267             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the \r
1268                                         //   break position being tested.  The candidate break\r
1269                                         //   location is before p2.\r
1270             int     breakPos = -1;\r
1271             \r
1272             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.\r
1273             int c;\r
1274             \r
1275             // Prev break at end of string.  return DONE.\r
1276             if (prevPos >= fText.length()) {\r
1277                 return -1;\r
1278             }\r
1279             /*p0 =*/ p1 = p2 = p3 = prevPos;\r
1280             c3 = UTF16.charAt(fText, prevPos);\r
1281             c0 = c1 = c2 = 0;\r
1282             \r
1283             // Loop runs once per "significant" character position in the input text.\r
1284             for (;;) {\r
1285                 // Move all of the positions forward in the input string.\r
1286                 /*p0 = p1;*/  c0 = c1;\r
1287                 p1 = p2;  c1 = c2;\r
1288                 p2 = p3;  c2 = c3;\r
1289                 \r
1290                 // Advancd p3 by  X(Extend | Format)*   Rule 4\r
1291                 p3 = moveForward(p3);\r
1292                 c3 = cAt(p3);\r
1293                 \r
1294                 // Rule (3) CR x LF\r
1295                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {\r
1296                     continue;\r
1297                 }\r
1298                 \r
1299                 // Rule (4)    Sep  <break>\r
1300                 if (fSepSet.contains(c1)) {\r
1301                     p2 = p1+1;   // Separators don't combine with Extend or Format\r
1302                     break;\r
1303                 }               \r
1304 \r
1305                 if (p2 >= fText.length()) {\r
1306                     // Reached end of string.  Always a break position.\r
1307                     break;\r
1308                 }\r
1309 \r
1310                 if (p2 == prevPos) {\r
1311                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)\r
1312                     continue;\r
1313                 }\r
1314 \r
1315                 // Rule (6).   ATerm x Numeric\r
1316                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {\r
1317                     continue;\r
1318                 }\r
1319 \r
1320                 // Rule (7).  Upper ATerm  x  Uppper\r
1321                 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {\r
1322                     continue;\r
1323                 }\r
1324 \r
1325                 // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower\r
1326                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a \r
1327                 //                  note to the Unicode 5.0 documents.\r
1328                 int p8 = p1;\r
1329                 while (p8>0 && fSpSet.contains(cAt(p8))) {\r
1330                     p8 = moveBack(p8);\r
1331                 }\r
1332                 while (p8>0 && fCloseSet.contains(cAt(p8))) {\r
1333                     p8 = moveBack(p8);\r
1334                 }\r
1335                 if (fATermSet.contains(cAt(p8))) {\r
1336                     p8=p2;\r
1337                     for (;;) {\r
1338                         c = cAt(p8);\r
1339                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||\r
1340                             fLowerSet.contains(c) || fSepSet.contains(c) ||\r
1341                             fATermSet.contains(c) || fSTermSet.contains(c))  \r
1342                          {\r
1343                             break;\r
1344                         }\r
1345                         p8 = moveForward(p8);\r
1346                     }\r
1347                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {\r
1348                         continue;\r
1349                     }\r
1350                 }\r
1351                 \r
1352                 // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)\r
1353                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {\r
1354                     p8 = p1;\r
1355                     while (setContains(fSpSet, cAt(p8))) {\r
1356                         p8 = moveBack(p8);\r
1357                     }\r
1358                     while (setContains(fCloseSet, cAt(p8))) {\r
1359                         p8 = moveBack(p8);\r
1360                     }\r
1361                     c = cAt(p8);\r
1362                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {\r
1363                         continue;\r
1364                     }\r
1365                 }\r
1366 \r
1367 \r
1368                 // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)\r
1369                 int p9 = p1;\r
1370                 while (p9>0 && fCloseSet.contains(cAt(p9))) {\r
1371                     p9 = moveBack(p9);\r
1372                 }\r
1373                 c = cAt(p9);\r
1374                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {\r
1375                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {\r
1376                         continue;\r
1377                     }\r
1378                 }\r
1379 \r
1380                 // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)\r
1381                 int p10 = p1;\r
1382                 while (p10>0 && fSpSet.contains(cAt(p10))) {\r
1383                     p10 = moveBack(p10);\r
1384                 }\r
1385                 while (p10>0 && fCloseSet.contains(cAt(p10))) {\r
1386                     p10 = moveBack(p10);\r
1387                 }\r
1388                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {\r
1389                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {\r
1390                         continue;\r
1391                     }\r
1392                 }\r
1393 \r
1394                 // Rule (11)  (STerm | ATerm) Close* Sp*   <break>\r
1395                 int p11 = p1;\r
1396                 if (p11>0 && fSepSet.contains(cAt(p11))) {\r
1397                     p11 = moveBack(p11);\r
1398                 }\r
1399                 while (p11>0 && fSpSet.contains(cAt(p11))) {\r
1400                     p11 = moveBack(p11);\r
1401                 }\r
1402                 while (p11>0 && fCloseSet.contains(cAt(p11))) {\r
1403                     p11 = moveBack(p11);\r
1404                 }\r
1405                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {\r
1406                     break;\r
1407                 }\r
1408 \r
1409                 //  Rule (12)  Any x Any\r
1410                 continue;\r
1411             }\r
1412             breakPos = p2;\r
1413             return breakPos;\r
1414         }\r
1415            \r
1416 \r
1417         \r
1418     }\r
1419 \r
1420  \r
1421     /**\r
1422      * Move an index into a string by n code points.\r
1423      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were\r
1424      *   complicating usage.\r
1425      * @param s   a Text string\r
1426      * @param pos The starting code unit index into the text string\r
1427      * @param amt The amount to adjust the string by.\r
1428      * @return    The adjusted code unit index, pinned to the string's length, or\r
1429      *            unchanged if input index was outside of the string.\r
1430      */\r
1431     static int moveIndex32(StringBuffer s, int pos, int amt) {\r
1432         int i;\r
1433         char  c;\r
1434         if (amt>0) {\r
1435             for (i=0; i<amt; i++) {\r
1436                 if (pos >= s.length()) {\r
1437                     return s.length();                   \r
1438                 }\r
1439                 c = s.charAt(pos);\r
1440                 pos++;\r
1441                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {\r
1442                     c = s.charAt(pos);\r
1443                     if (UTF16.isTrailSurrogate(c)) {\r
1444                         pos++;   \r
1445                     }\r
1446                 }\r
1447             }\r
1448         } else {\r
1449             for (i=0; i>amt; i--) {\r
1450                 if (pos <= 0) {\r
1451                     return 0;   \r
1452                 }\r
1453                 pos--;\r
1454                 c = s.charAt(pos);\r
1455                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {\r
1456                     c = s.charAt(pos);\r
1457                     if (UTF16.isLeadSurrogate(c)) {\r
1458                         pos--;   \r
1459                     }\r
1460                 }\r
1461             }\r
1462         }\r
1463         return pos;\r
1464     }\r
1465     \r
1466     /**\r
1467      * No-exceptions form of UnicodeSet.contains(c).\r
1468      *    Simplifies loops that terminate with an end-of-input character value.\r
1469      * @param s  A unicode set\r
1470      * @param c  A code point value\r
1471      * @return   true if the set contains c.\r
1472      */\r
1473     static boolean setContains(UnicodeSet s, int c) {\r
1474         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {\r
1475             return false;\r
1476         }\r
1477         return s.contains(c);\r
1478     }\r
1479     \r
1480     \r
1481     /**\r
1482      * return the index of the next code point in the input text.\r
1483      * @param i the preceding index\r
1484      * @return\r
1485      */\r
1486     static int  nextCP(StringBuffer s, int i) {\r
1487         if (i == -1) {\r
1488             // End of Input indication.  Continue to return end value.\r
1489             return -1;\r
1490         }\r
1491         int  retVal = i + 1;\r
1492         if (retVal > s.length()) {\r
1493             return -1;\r
1494         }\r
1495         int  c = UTF16.charAt(s, i);\r
1496         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {\r
1497             retVal++;\r
1498         }\r
1499         return retVal;\r
1500     }\r
1501     \r
1502     \r
1503     /**\r
1504      * random number generator.  Not using Java's built-in Randoms for two reasons:\r
1505      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.\r
1506      *    2.  We need to get and restore the seed from values occurring in the middle\r
1507      *        of a long sequence, to more easily reproduce failing cases.\r
1508      */\r
1509     private static int m_seed = 1;\r
1510     private static int  m_rand()\r
1511     {\r
1512         m_seed = m_seed * 1103515245 + 12345;\r
1513         return (int)(m_seed >>> 16) % 32768;\r
1514     }\r
1515 \r
1516     // Helper function for formatting error output.\r
1517     //   Append a string into a fixed-size field in a StringBuffer.\r
1518     //   Blank-pad the string if it is shorter than the field.\r
1519     //   Truncate the source string if it is too long.\r
1520     //\r
1521     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {\r
1522         int appendLen = src.length();\r
1523         if (appendLen >= fieldLen) {\r
1524             dest.append(src.substring(0, fieldLen));\r
1525         } else {\r
1526             dest.append(src);\r
1527             while (appendLen < fieldLen) {\r
1528                 dest.append(' ');\r
1529                 appendLen++;\r
1530             }\r
1531         }\r
1532     }\r
1533 \r
1534     // Helper function for formatting error output.\r
1535     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format\r
1536     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {\r
1537            String hexChars = "0123456789abcdef";\r
1538            if (c < 0x10000) {\r
1539                 dest.append("\\u");\r
1540                 for (int bn=12; bn>=0; bn-=4) {\r
1541                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));\r
1542                 }\r
1543                 appendToBuf(dest, " ", fieldLen-6);\r
1544             } else {\r
1545                 dest.append("\\U");\r
1546                 for (int bn=28; bn>=0; bn-=4) {\r
1547                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));\r
1548                 }\r
1549                 appendToBuf(dest, " ", fieldLen-10);\r
1550 \r
1551             }\r
1552        }\r
1553     \r
1554 /**\r
1555  *  Run a RBBI monkey test.  Common routine, for all break iterator types.\r
1556  *    Parameters:\r
1557  *       bi      - the break iterator to use\r
1558  *       mk      - MonkeyKind, abstraction for obtaining expected results\r
1559  *       name    - Name of test (char, word, etc.) for use in error messages\r
1560  *       seed    - Seed for starting random number generator (parameter from user)\r
1561  *       numIterations\r
1562  */\r
1563 void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {\r
1564     int              TESTSTRINGLEN = 500;\r
1565     StringBuffer     testText         = new StringBuffer();\r
1566     int              numCharClasses;\r
1567     List             chClasses;\r
1568     int[]            expected         = new int[TESTSTRINGLEN*2 + 1];\r
1569     int              expectedCount    = 0;\r
1570     boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];\r
1571     boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];\r
1572     boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];\r
1573     boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];\r
1574     boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];\r
1575     boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];\r
1576     int              i;\r
1577     int              loopCount        = 0;\r
1578     boolean          printTestData    = false;\r
1579     boolean          printBreaksFromBI = false;\r
1580 \r
1581     m_seed = seed;\r
1582 \r
1583     numCharClasses = mk.charClasses().size();\r
1584     chClasses      = mk.charClasses();\r
1585 \r
1586     // Verify that the character classes all have at least one member.\r
1587     for (i=0; i<numCharClasses; i++) {\r
1588         UnicodeSet s = (UnicodeSet)chClasses.get(i);\r
1589         if (s == null || s.size() == 0) {\r
1590             errln("Character Class " + i + " is null or of zero size.");\r
1591             return;\r
1592         }\r
1593     }\r
1594 \r
1595     //--------------------------------------------------------------------------------------------\r
1596     //\r
1597     //  Debugging settings.  Comment out everything in the following block for normal operation\r
1598     //\r
1599     //--------------------------------------------------------------------------------------------\r
1600     // numIterations = -1;  \r
1601     // RuleBasedBreakIterator_New.fTrace = true;\r
1602     // m_seed = 859056465;\r
1603     // TESTSTRINGLEN = 50;\r
1604     // printTestData = true;\r
1605     // printBreaksFromBI = true;\r
1606     // ((RuleBasedBreakIterator_New)bi).dump();\r
1607     \r
1608     //--------------------------------------------------------------------------------------------\r
1609     //\r
1610     //  End of Debugging settings.  \r
1611     //\r
1612     //--------------------------------------------------------------------------------------------\r
1613     \r
1614     int  dotsOnLine = 0;\r
1615      while (loopCount < numIterations || numIterations == -1) {\r
1616         if (numIterations == -1 && loopCount % 10 == 0) {\r
1617             // If test is running in an infinite loop, display a periodic tic so\r
1618             //   we can tell that it is making progress.\r
1619             System.out.print(".");\r
1620             if (dotsOnLine++ >= 80){\r
1621                 System.out.println();\r
1622                 dotsOnLine = 0;\r
1623             }\r
1624         }\r
1625         // Save current random number seed, so that we can recreate the random numbers\r
1626         //   for this loop iteration in event of an error.\r
1627         seed = m_seed;\r
1628 \r
1629         testText.setLength(0);\r
1630         // Populate a test string with data.\r
1631         if (printTestData) {\r
1632             System.out.println("Test Data string ..."); \r
1633         }\r
1634         for (i=0; i<TESTSTRINGLEN; i++) {\r
1635             int        aClassNum = m_rand() % numCharClasses;\r
1636             UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);\r
1637             int        charIdx   = m_rand() % classSet.size();\r
1638             int        c         = classSet.charAt(charIdx);\r
1639             if (c < 0) {   // TODO:  deal with sets containing strings.\r
1640                 errln("c < 0");\r
1641             }\r
1642             UTF16.appendCodePoint(testText, c);\r
1643             if (printTestData) {\r
1644                 System.out.print(Integer.toHexString(c) + " ");\r
1645             }\r
1646         }\r
1647         if (printTestData) {\r
1648             System.out.println(); \r
1649         }\r
1650 \r
1651         Arrays.fill(expected, 0);\r
1652         Arrays.fill(expectedBreaks, false);\r
1653         Arrays.fill(forwardBreaks, false);\r
1654         Arrays.fill(reverseBreaks, false);\r
1655         Arrays.fill(isBoundaryBreaks, false);\r
1656         Arrays.fill(followingBreaks, false);\r
1657         Arrays.fill(precedingBreaks, false);\r
1658  \r
1659         // Calculate the expected results for this test string.\r
1660         mk.setText(testText);\r
1661         expectedCount = 0;\r
1662         expectedBreaks[0] = true;\r
1663         expected[expectedCount ++] = 0;\r
1664         int breakPos = 0;\r
1665         int lastBreakPos = -1;\r
1666         for (;;) {\r
1667             lastBreakPos = breakPos;\r
1668             breakPos = mk.next(breakPos);\r
1669             if (breakPos == -1) {\r
1670                 break;\r
1671             }\r
1672             if (breakPos > testText.length()) {\r
1673                 errln("breakPos > testText.length()");\r
1674             }\r
1675             if (lastBreakPos >= breakPos) {\r
1676                 errln("Next() not increasing.");\r
1677                 // break;\r
1678             }\r
1679             expectedBreaks[breakPos] = true;\r
1680             expected[expectedCount ++] = breakPos;\r
1681         }\r
1682 \r
1683         // Find the break positions using forward iteration\r
1684         if (printBreaksFromBI) {\r
1685             System.out.println("Breaks from BI...");  \r
1686         }\r
1687         bi.setText(testText.toString());\r
1688         for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {\r
1689             if (i < 0 || i > testText.length()) {\r
1690                 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");\r
1691                 break;\r
1692             }\r
1693             if (printBreaksFromBI) {\r
1694                 System.out.print(Integer.toHexString(i) + " ");\r
1695             }\r
1696             forwardBreaks[i] = true;\r
1697         }\r
1698         if (printBreaksFromBI) {\r
1699             System.out.println();\r
1700         }\r
1701 \r
1702         // Find the break positions using reverse iteration\r
1703         for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {\r
1704             if (i < 0 || i > testText.length()) {\r
1705                 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);\r
1706                 break;\r
1707             }\r
1708             reverseBreaks[i] = true;\r
1709         }\r
1710 \r
1711         // Find the break positions using isBoundary() tests.\r
1712         for (i=0; i<=testText.length(); i++) {\r
1713             isBoundaryBreaks[i] = bi.isBoundary(i);\r
1714         }\r
1715 \r
1716         // Find the break positions using the following() function.\r
1717         lastBreakPos = 0;\r
1718         followingBreaks[0] = true;\r
1719         for (i=0; i<testText.length(); i++) {\r
1720             breakPos = bi.following(i);\r
1721             if (breakPos <= i ||\r
1722                 breakPos < lastBreakPos ||\r
1723                 breakPos > testText.length() ||\r
1724                 breakPos > lastBreakPos && lastBreakPos > i ) {\r
1725                 errln(name + " break monkey test: " +\r
1726                     "Out of range value returned by BreakIterator::following().\n" +\r
1727                     "index=" + i + "following returned=" + breakPos +\r
1728                     "lastBreak=" + lastBreakPos);\r
1729                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.\r
1730             } else {\r
1731                 followingBreaks[breakPos] = true;\r
1732                 lastBreakPos = breakPos;\r
1733             }\r
1734         }\r
1735         \r
1736         // Find the break positions using the preceding() function.\r
1737         lastBreakPos = testText.length();\r
1738         precedingBreaks[testText.length()] = true;\r
1739         for (i=testText.length(); i>0; i--) {\r
1740             breakPos = bi.preceding(i);\r
1741             if (breakPos >= i ||\r
1742                 breakPos > lastBreakPos ||\r
1743                 breakPos < 0 ||\r
1744                 breakPos < lastBreakPos && lastBreakPos < i ) {\r
1745                 errln(name + " break monkey test: " +\r
1746                         "Out of range value returned by BreakIterator::preceding().\n" +\r
1747                         "index=" + i + "preceding returned=" + breakPos +\r
1748                         "lastBreak=" + lastBreakPos);\r
1749                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.\r
1750             } else {\r
1751                 precedingBreaks[breakPos] = true;\r
1752                 lastBreakPos = breakPos;\r
1753             }\r
1754         }\r
1755 \r
1756         \r
1757 \r
1758         // Compare the expected and actual results.\r
1759         for (i=0; i<=testText.length(); i++) {\r
1760             String errorType = null;\r
1761             if  (forwardBreaks[i] != expectedBreaks[i]) {\r
1762                 errorType = "next()";\r
1763             } else if (reverseBreaks[i] != forwardBreaks[i]) {\r
1764                 errorType = "previous()";\r
1765             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {\r
1766                 errorType = "isBoundary()";\r
1767             } else if (followingBreaks[i] != expectedBreaks[i]) {\r
1768                 errorType = "following()";\r
1769             } else if (precedingBreaks[i] != expectedBreaks[i]) {\r
1770                 errorType = "preceding()";\r
1771             }\r
1772 \r
1773 \r
1774             if (errorType != null) {\r
1775                 // Format a range of the test text that includes the failure as\r
1776                 //  a data item that can be included in the rbbi test data file.\r
1777 \r
1778                 // Start of the range is the last point where expected and actual results\r
1779                 //   both agreed that there was a break position.\r
1780                 int startContext = i;\r
1781                 int count = 0;\r
1782                 for (;;) {\r
1783                     if (startContext==0) { break; }\r
1784                     startContext --;\r
1785                     if (expectedBreaks[startContext]) {\r
1786                         if (count == 2) break;\r
1787                         count ++;\r
1788                     }\r
1789                 }\r
1790 \r
1791                 // End of range is two expected breaks past the start position.\r
1792                 int endContext = i + 1;\r
1793                 int ci;\r
1794                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.\r
1795                     for (;;) {\r
1796                         if (endContext >= testText.length()) {break;}\r
1797                         if (expectedBreaks[endContext-1]) { \r
1798                             if (count == 0) break;\r
1799                             count --;\r
1800                         }\r
1801                         endContext ++;\r
1802                     }\r
1803                 }\r
1804 \r
1805                 // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"\r
1806                 StringBuffer errorText = new StringBuffer();\r
1807 \r
1808                 int      c;    // Char from test data\r
1809                 for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {\r
1810                     if (ci == i) {\r
1811                         // This is the location of the error.\r
1812                         errorText.append("<?>---------------------------------\n");\r
1813                     } else if (expectedBreaks[ci]) {\r
1814                         // This a non-error expected break position.\r
1815                         errorText.append("------------------------------------\n");\r
1816                     }\r
1817                     if (ci < testText.length()) {\r
1818                         c = UTF16.charAt(testText, ci);\r
1819                         appendCharToBuf(errorText, c, 11);\r
1820                         String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);\r
1821                         appendToBuf(errorText, gc, 8);\r
1822                         int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);\r
1823                         String extraPropValue = \r
1824                             UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);\r
1825                         appendToBuf(errorText, extraPropValue, 20);\r
1826 \r
1827                         String charName = UCharacter.getExtendedName(c);\r
1828                         appendToBuf(errorText, charName, 40);\r
1829                         errorText.append('\n');\r
1830                     }\r
1831                 }\r
1832                 if (ci == testText.length() && ci != -1) {\r
1833                     errorText.append("<>");\r
1834                 }\r
1835                 errorText.append("</data>\n");\r
1836 \r
1837                 // Output the error\r
1838                 errln(name + " break monkey test error.  " + \r
1839                      (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +\r
1840                       "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +\r
1841                       errorText);\r
1842                 break;\r
1843             }\r
1844         }\r
1845 \r
1846         loopCount++;\r
1847     }\r
1848 }\r
1849 \r
1850 public void TestCharMonkey() {\r
1851     \r
1852     int        loopCount = 500;\r
1853     int        seed      = 1;\r
1854     \r
1855     if (params.inclusion >= 9) {\r
1856         loopCount = 10000;\r
1857     }\r
1858     \r
1859     RBBICharMonkey  m = new RBBICharMonkey();\r
1860     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);\r
1861     RunMonkey(bi, m, "char", seed, loopCount);\r
1862 }\r
1863 \r
1864 public void TestWordMonkey() {\r
1865     \r
1866     int        loopCount = 500;\r
1867     int        seed      = 1;\r
1868     \r
1869     if (params.inclusion >= 9) {\r
1870         loopCount = 10000;\r
1871     }\r
1872     \r
1873     logln("Word Break Monkey Test");\r
1874     RBBIWordMonkey  m = new RBBIWordMonkey();\r
1875     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);\r
1876     RunMonkey(bi, m, "word", seed, loopCount);\r
1877 }\r
1878 \r
1879 public void TestLineMonkey() {\r
1880     int        loopCount = 500;\r
1881     int        seed      = 1;\r
1882     \r
1883     if (params.inclusion >= 9) {\r
1884         loopCount = 10000;\r
1885     }\r
1886     \r
1887     logln("Line Break Monkey Test");\r
1888     RBBILineMonkey  m = new RBBILineMonkey();\r
1889     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);\r
1890     if (params == null) {\r
1891         loopCount = 50;\r
1892     }\r
1893     RunMonkey(bi, m, "line", seed, loopCount);\r
1894 }\r
1895 \r
1896 public void TestSentMonkey() {\r
1897     \r
1898     int        loopCount = 500;\r
1899     int        seed      = 1;\r
1900     \r
1901     if (params.inclusion >= 9) {\r
1902         loopCount = 3000;\r
1903     }\r
1904     \r
1905     logln("Sentence Break Monkey Test");\r
1906     RBBISentenceMonkey  m = new RBBISentenceMonkey();\r
1907     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);\r
1908     if (params == null) {\r
1909         loopCount = 30;\r
1910     }\r
1911     RunMonkey(bi, m, "sent", seed, loopCount);\r
1912 }\r
1913 //\r
1914 //  Round-trip monkey tests.\r
1915 //  Verify that break iterators created from the rule source from the default\r
1916 //    break iterators still pass the monkey test for the iterator type.\r
1917 //\r
1918 //  This is a major test for the Rule Compiler.  The default break iterators are built\r
1919 //  from pre-compiled binary rule data that was created using ICU4C; these\r
1920 //  round-trip rule recompile tests verify that the Java rule compiler can\r
1921 //  rebuild break iterators from the original source rules.\r
1922 //\r
1923 public void TestRTCharMonkey() {\r
1924     \r
1925     int        loopCount = 200;\r
1926     int        seed      = 1;\r
1927     \r
1928     if (params.inclusion >= 9) {\r
1929         loopCount = 2000;\r
1930     }\r
1931     \r
1932     RBBICharMonkey  m = new RBBICharMonkey();\r
1933     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);\r
1934     String rules = bi.toString();\r
1935     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1936     RunMonkey(rtbi, m, "char", seed, loopCount);\r
1937 }\r
1938 \r
1939 public void TestRTWordMonkey() {\r
1940     \r
1941     int        loopCount = 200;\r
1942     int        seed      = 1;\r
1943     \r
1944     if (params.inclusion >= 9) {\r
1945         loopCount = 2000;\r
1946     }\r
1947     \r
1948     logln("Word Break Monkey Test");\r
1949     RBBIWordMonkey  m = new RBBIWordMonkey();\r
1950     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);\r
1951     String rules = bi.toString();\r
1952     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1953     RunMonkey(rtbi, m, "word", seed, loopCount);\r
1954 }\r
1955 \r
1956 public void TestRTLineMonkey() {\r
1957     int        loopCount = 200;\r
1958     int        seed      = 1;\r
1959     \r
1960     if (params.inclusion >= 9) {\r
1961         loopCount = 2000;\r
1962     }\r
1963     \r
1964     logln("Line Break Monkey Test");\r
1965     RBBILineMonkey  m = new RBBILineMonkey();\r
1966     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);\r
1967     String rules = bi.toString();\r
1968     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1969     if (params == null) {\r
1970         loopCount = 50;\r
1971     }\r
1972     RunMonkey(rtbi, m, "line", seed, loopCount);\r
1973 }\r
1974 \r
1975 public void TestRTSentMonkey() {\r
1976     \r
1977     int        loopCount = 200;\r
1978     int        seed      = 1;\r
1979     \r
1980     if (params.inclusion >= 9) {\r
1981         loopCount = 1000;\r
1982     }\r
1983     \r
1984     logln("Sentence Break Monkey Test");\r
1985     RBBISentenceMonkey  m = new RBBISentenceMonkey();\r
1986     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);\r
1987     String rules = bi.toString();\r
1988     BreakIterator rtbi = new RuleBasedBreakIterator(rules);\r
1989     if (params == null) {\r
1990         loopCount = 30;\r
1991     }\r
1992     RunMonkey(rtbi, m, "sent", seed, loopCount);\r
1993 }\r
1994 \r
1995 \r
1996 \r
1997 }\r
1998 \r