]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
Added flags.
[Dictionary.git] / jars / icu4j-52_1 / main / tests / core / src / com / ibm / icu / dev / test / rbbi / RBBITestMonkey.java
1 /*
2  *******************************************************************************
3  * Copyright (C) 2003-2013 International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7  package com.ibm.icu.dev.test.rbbi;
8
9
10 // Monkey testing of RuleBasedBreakIterator
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.List;
14 import java.util.Locale;
15
16 import com.ibm.icu.dev.test.TestFmwk;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.lang.UProperty;
19 import com.ibm.icu.text.BreakIterator;
20 import com.ibm.icu.text.RuleBasedBreakIterator;
21 import com.ibm.icu.text.UTF16;
22 import com.ibm.icu.text.UnicodeSet;
23
24
25 /**
26  * Monkey tests for RBBI.  These tests have independent implementations of
27  * the Unicode TR boundary rules, and compare results between these and ICU's
28  * implementation, using random data.
29  * 
30  * Tests cover Grapheme Cluster (char), Word and Line breaks
31  * 
32  * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
33  *
34  */
35 public class RBBITestMonkey extends TestFmwk {
36     
37     public static void main(String[] args) {
38         new RBBITestMonkey().run(args);
39     }
40     
41 //
42 //     class RBBIMonkeyKind
43 //
44 //        Monkey Test for Break Iteration
45 //        Abstract interface class.   Concrete derived classes independently
46 //        implement the break rules for different iterator types.
47 //
48 //        The Monkey Test itself doesn't know which type of break iterator it is
49 //        testing, but works purely in terms of the interface defined here.
50 //
51     abstract static class RBBIMonkeyKind {
52     
53         // Return a List of UnicodeSets, representing the character classes used
54         //   for this type of iterator.
55         abstract  List  charClasses();
56
57         // Set the test text on which subsequent calls to next() will operate
58         abstract  void   setText(StringBuffer text);
59
60         // Find the next break position, starting from the specified position.
61         // Return -1 after reaching end of string.
62         abstract   int   next(int i);
63         
64         // A Character Property, one of the constants defined in class UProperty.
65         //   The value of this property will be displayed for the characters
66         //    near any test failure.  
67         int   fCharProperty;
68     }
69
70  
71     /**
72      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
73      * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
74      */
75     static class RBBICharMonkey extends RBBIMonkeyKind {
76         List                      fSets;
77
78         UnicodeSet                fCRLFSet;
79         UnicodeSet                fControlSet;
80         UnicodeSet                fExtendSet;
81         UnicodeSet                fRegionalIndicatorSet;
82         UnicodeSet                fPrependSet;
83         UnicodeSet                fSpacingSet;
84         UnicodeSet                fLSet;
85         UnicodeSet                fVSet;
86         UnicodeSet                fTSet;
87         UnicodeSet                fLVSet;
88         UnicodeSet                fLVTSet;
89         UnicodeSet                fHangulSet;
90         UnicodeSet                fAnySet;
91
92         StringBuffer              fText;
93
94
95     RBBICharMonkey() {
96         fText       = null;
97         fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
98         fCRLFSet    = new UnicodeSet("[\\r\\n]");
99         fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
100         fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
101         fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
102         fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
103         fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
104         fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
105         fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
106         fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
107         fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
108         fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
109         fHangulSet  = new UnicodeSet();
110         fHangulSet.addAll(fLSet);
111         fHangulSet.addAll(fVSet);
112         fHangulSet.addAll(fTSet);
113         fHangulSet.addAll(fLVSet);
114         fHangulSet.addAll(fLVTSet);
115
116         fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]");
117
118         fSets       = new ArrayList();
119         fSets.add(fCRLFSet);
120         fSets.add(fControlSet);
121         fSets.add(fExtendSet);
122         fSets.add(fRegionalIndicatorSet);
123         if (!fPrependSet.isEmpty()) {
124             fSets.add(fPrependSet);
125         }
126         fSets.add(fSpacingSet);
127         fSets.add(fHangulSet);
128         fSets.add(fAnySet);
129      }
130
131
132     void setText(StringBuffer s) {
133         fText = s;
134     }
135     
136     List charClasses() {
137         return fSets;
138     }
139     
140     int next(int prevPos) {
141         int    p1, p2, p3;    // Indices of the significant code points around the
142                               //   break position being tested.  The candidate break
143                               //   location is before p2.
144     
145         int     breakPos = -1;
146     
147         int   c1, c2, c3;     // The code points at p0, p1, p2 & p3.
148         
149         // Previous break at end of string.  return DONE.
150         if (prevPos >= fText.length()) {
151             return -1;
152         }
153         p1 = p2 = p3 = prevPos;
154         c3 =  UTF16.charAt(fText, prevPos);
155         c1 = c2 = 0;
156     
157         // Loop runs once per "significant" character position in the input text.
158         for (;;) {
159             // Move all of the positions forward in the input string.
160             p1 = p2;  c1 = c2;
161             p2 = p3;  c2 = c3;
162     
163             // Advance p3 by one codepoint
164             p3 = moveIndex32(fText, p3, 1);
165             c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
166     
167             if (p1 == p2) {
168                 // Still warming up the loop.  (won't work with zero length strings, but we don't care)
169                 continue;
170             }
171             if (p2 == fText.length()) {
172                 // Reached end of string.  Always a break position.
173                 break;
174             }
175     
176             // Rule  GB3   CR x LF
177             //     No Extend or Format characters may appear between the CR and LF,
178             //     which requires the additional check for p2 immediately following p1.
179             //
180             if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
181                 continue;
182             }
183     
184             // Rule (GB4).   ( Control | CR | LF ) <break>
185             if (fControlSet.contains(c1) ||
186                 c1 == 0x0D ||
187                 c1 == 0x0A)  {
188                 break;
189             }
190     
191             // Rule (GB5)    <break>  ( Control | CR | LF )
192             //
193             if (fControlSet.contains(c2) ||
194                 c2 == 0x0D ||
195                 c2 == 0x0A)  {
196                 break;
197             }
198     
199     
200             // Rule (GB6)  L x ( L | V | LV | LVT )
201             if (fLSet.contains(c1) &&
202                 (fLSet.contains(c2)  ||
203                     fVSet.contains(c2)  ||
204                     fLVSet.contains(c2) ||
205                     fLVTSet.contains(c2))) {
206                 continue;
207             }
208     
209             // Rule (GB7)    ( LV | V )  x  ( V | T )
210             if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
211                 (fVSet.contains(c2) || fTSet.contains(c2)))  {
212                 continue;
213             }
214     
215             // Rule (GB8)    ( LVT | T)  x T
216             if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
217                 fTSet.contains(c2))  {
218                 continue;
219             }
220     
221             // Rule (GB8a)   Regional_Indicator x Regional_Indicator
222             if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
223                 continue;
224             }
225             
226             // Rule (GB9)    Numeric x ALetter
227             if (fExtendSet.contains(c2))  {
228                 continue;
229             }
230             
231             // Rule (GB9a)   x  SpacingMark
232             if (fSpacingSet.contains(c2)) {
233                 continue;
234             }
235     
236             // Rule (GB9b)   Prepend x
237             if (fPrependSet.contains(c1)) {
238                 continue;
239             }
240     
241             // Rule (GB10)  Any  <break>  Any
242             break;
243         }
244     
245         breakPos = p2;
246         return breakPos;
247         }
248     }
249
250
251     /**
252      * 
253      * Word Monkey Test Class
254      *
255      * 
256      * 
257      */
258     static class RBBIWordMonkey extends RBBIMonkeyKind {
259         List                      fSets;
260         StringBuffer              fText;
261
262         UnicodeSet                fCRSet;
263         UnicodeSet                fLFSet;
264         UnicodeSet                fNewlineSet;
265         UnicodeSet                fRegionalIndicatorSet;
266         UnicodeSet                fKatakanaSet;
267         UnicodeSet                fHebrew_LetterSet;
268         UnicodeSet                fALetterSet;
269         UnicodeSet                fSingle_QuoteSet;
270         UnicodeSet                fDouble_QuoteSet;
271         UnicodeSet                fMidNumLetSet;
272         UnicodeSet                fMidLetterSet;
273         UnicodeSet                fMidNumSet;
274         UnicodeSet                fNumericSet;
275         UnicodeSet                fFormatSet;
276         UnicodeSet                fExtendSet;
277         UnicodeSet                fExtendNumLetSet;
278         UnicodeSet                fOtherSet;        
279         UnicodeSet                fDictionaryCjkSet;
280
281         
282         RBBIWordMonkey() {
283             fCharProperty    = UProperty.WORD_BREAK;
284
285             fDictionaryCjkSet= new UnicodeSet("[[:Script=Hangul:][:Han:][:Hiragana:][:Katakana:]]");
286             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
287             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
288             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
289             fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");            
290             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
291             fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");            
292             fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
293             fALetterSet.removeAll(fDictionaryCjkSet);
294             fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
295             fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");           
296             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
297             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
298             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
299             fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
300             fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
301             fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
302             fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
303
304             fOtherSet        = new UnicodeSet();
305             fOtherSet.complement();
306             fOtherSet.removeAll(fCRSet);
307             fOtherSet.removeAll(fLFSet);
308             fOtherSet.removeAll(fNewlineSet);
309             fOtherSet.removeAll(fALetterSet);
310             fOtherSet.removeAll(fSingle_QuoteSet);
311             fOtherSet.removeAll(fDouble_QuoteSet);
312             fOtherSet.removeAll(fKatakanaSet);
313             fOtherSet.removeAll(fHebrew_LetterSet);
314             fOtherSet.removeAll(fMidLetterSet);
315             fOtherSet.removeAll(fMidNumSet);
316             fOtherSet.removeAll(fNumericSet);
317             fOtherSet.removeAll(fFormatSet);
318             fOtherSet.removeAll(fExtendSet);
319             fOtherSet.removeAll(fExtendNumLetSet);
320             fOtherSet.removeAll(fRegionalIndicatorSet);
321             // Inhibit dictionary characters from being tested at all.
322             // remove surrogates so as to not generate higher CJK characters
323             fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
324             fOtherSet.removeAll(fDictionaryCjkSet);
325
326             fSets            = new ArrayList();
327             fSets.add(fCRSet);
328             fSets.add(fLFSet);
329             fSets.add(fNewlineSet);
330             fSets.add(fRegionalIndicatorSet);
331             fSets.add(fHebrew_LetterSet);
332             fSets.add(fALetterSet);
333             //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
334             fSets.add(fSingle_QuoteSet);
335             fSets.add(fDouble_QuoteSet);
336             fSets.add(fMidLetterSet);
337             fSets.add(fMidNumLetSet);
338             fSets.add(fMidNumSet);
339             fSets.add(fNumericSet);
340             fSets.add(fFormatSet);
341             fSets.add(fExtendSet);
342             fSets.add(fExtendNumLetSet);
343             fSets.add(fOtherSet);
344         }
345         
346         
347         List  charClasses() {
348          return fSets;  
349         }
350         
351         void   setText(StringBuffer s) { 
352             fText = s;        
353         }   
354
355         int   next(int prevPos) {  
356             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the 
357                                         //   break position being tested.  The candidate break
358                                         //   location is before p2.
359             int     breakPos = -1;
360             
361             int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
362             
363             // Previous break at end of string.  return DONE.
364             if (prevPos >= fText.length()) {
365                 return -1;
366             }
367             /*p0 =*/ p1 = p2 = p3 = prevPos;
368             c3 = UTF16.charAt(fText, prevPos);
369             c0 = c1 = c2 = 0;
370             
371             
372
373             // Loop runs once per "significant" character position in the input text.
374             for (;;) {
375                 // Move all of the positions forward in the input string.
376                 /*p0 = p1;*/  c0 = c1;
377                 p1 = p2;  c1 = c2;
378                 p2 = p3;  c2 = c3;
379                 
380                 // Advance p3 by    X(Extend | Format)*   Rule 4
381                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
382                 do {
383                     p3 = moveIndex32(fText, p3, 1);
384                     c3 = -1;
385                     if (p3>=fText.length()) {
386                         break;
387                     }
388                     c3 = UTF16.charAt(fText, p3);
389                     if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
390                         break;
391                     }
392                 }
393                 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
394
395                 if (p1 == p2) {
396                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
397                     continue;
398                 }
399                 if (p2 == fText.length()) {
400                     // Reached end of string.  Always a break position.
401                     break;
402                 }
403
404                 // Rule (3)   CR x LF
405                 //     No Extend or Format characters may appear between the CR and LF,
406                 //     which requires the additional check for p2 immediately following p1.
407                 //
408                 if (c1==0x0D && c2==0x0A) {
409                     continue;
410                 }
411                 
412                 // Rule (3a)  Break before and after newlines (including CR and LF)
413                 //
414                 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
415                     break;
416                 }
417                 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
418                     break;
419                 }
420
421                 // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
422                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
423                     (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
424                     continue;
425                 }
426                
427                 // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
428                 //
429                 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
430                      (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
431                      (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
432                     continue;
433                 }
434
435                 // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
436                 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
437                     (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
438                     (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
439                     continue;
440                 }
441
442                 // Rule (7a)     Hebrew_Letter x Single_Quote
443                 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
444                     continue;
445                 }
446
447                 // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
448                 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
449                     continue;
450                 }
451
452                 // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
453                 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
454                     continue;
455                 }
456                 
457                 //  Rule (8)    Numeric x Numeric
458                 if (fNumericSet.contains(c1) &&
459                         fNumericSet.contains(c2))  {
460                     continue;
461                 }
462                 
463                 // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
464                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
465                     fNumericSet.contains(c2))  {
466                     continue;
467                 }
468
469                 // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
470                 if (fNumericSet.contains(c1) &&
471                     (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
472                     continue;
473                 }
474
475                 // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
476                 if (fNumericSet.contains(c0) &&
477                         (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
478                         fNumericSet.contains(c2)) {
479                     continue;
480                 }
481                 
482                 // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
483                 if (fNumericSet.contains(c1) &&
484                     (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
485                     setContains(fNumericSet, c3)) {
486                     continue;
487                 }
488                 
489                 // Rule (13)  Katakana x Katakana
490                 if (fKatakanaSet.contains(c1) &&
491                         fKatakanaSet.contains(c2))  {
492                     continue;
493                 }
494
495                 // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
496                 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
497                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
498                         fExtendNumLetSet.contains(c2)) {
499                     continue;
500                 }
501                 
502                 // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
503                 if (fExtendNumLetSet.contains(c1) &&
504                         (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
505                          fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
506                     continue;
507                 }
508
509                 
510                 // Rule 13c   Do not break between Regional Indicators. 
511                 //            Regional_Indicator  Ã—   Regional_Indicator
512                 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
513                     continue;
514                 }
515                 
516                 // Rule 14.  Break found here.
517                 break;
518             }
519             
520             breakPos = p2;
521             return breakPos;
522         }
523         
524     }
525
526  
527     static class RBBILineMonkey extends RBBIMonkeyKind {
528         
529         List        fSets;
530         
531         UnicodeSet  fBK;
532         UnicodeSet  fCR;
533         UnicodeSet  fLF;
534         UnicodeSet  fCM;
535         UnicodeSet  fNL;
536         UnicodeSet  fSG;
537         UnicodeSet  fWJ;
538         UnicodeSet  fZW;
539         UnicodeSet  fGL;
540         UnicodeSet  fCB;
541         UnicodeSet  fSP;
542         UnicodeSet  fB2;
543         UnicodeSet  fBA;
544         UnicodeSet  fBB;
545         UnicodeSet  fHY;
546         UnicodeSet  fCL;
547         UnicodeSet  fCP;
548         UnicodeSet  fEX;
549         UnicodeSet  fIN;
550         UnicodeSet  fNS;
551         UnicodeSet  fOP;
552         UnicodeSet  fQU;
553         UnicodeSet  fIS;
554         UnicodeSet  fNU;
555         UnicodeSet  fPO;
556         UnicodeSet  fPR;
557         UnicodeSet  fSY;
558         UnicodeSet  fAI;
559         UnicodeSet  fAL;
560         UnicodeSet  fHL;
561         UnicodeSet  fID;
562         UnicodeSet  fSA;
563         UnicodeSet  fJL;
564         UnicodeSet  fJV;
565         UnicodeSet  fJT;
566         UnicodeSet  fH2;
567         UnicodeSet  fH3;
568         UnicodeSet  fRI;
569         UnicodeSet  fXX;
570         
571         StringBuffer  fText;
572         int           fOrigPositions;
573         
574         
575         
576         RBBILineMonkey()
577         {
578             fCharProperty  = UProperty.LINE_BREAK;
579             fSets          = new ArrayList();
580             
581             fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
582             fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
583             fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
584             fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
585             fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
586             fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
587             fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
588             fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
589             fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
590             fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
591             fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
592             fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
593             fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
594             fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
595             fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
596             fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
597             fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
598             fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
599             fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
600             fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
601             fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
602             fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
603             fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
604             fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
605             fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
606             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
607             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
608             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
609             fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
610             fID    = new UnicodeSet("[\\p{Line_break=ID}]");
611             fSA    = new UnicodeSet("[\\p{Line_break=SA}]");
612             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
613             fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
614             fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
615             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
616             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
617             fSG    = new UnicodeSet("[\\ud800-\\udfff]");
618             fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
619             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
620
621             
622             fAL.addAll(fXX);     // Default behavior for XX is identical to AL
623             fAL.addAll(fAI);     // Default behavior for AI is identical to AL
624             fAL.addAll(fSA);     // Default behavior for SA is XX, which defaults to AL
625             fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
626             
627             
628             
629             fSets.add(fBK);
630             fSets.add(fCR);
631             fSets.add(fLF);
632             fSets.add(fCM);
633             fSets.add(fNL);
634             fSets.add(fWJ);
635             fSets.add(fZW);
636             fSets.add(fGL);
637             fSets.add(fCB);
638             fSets.add(fSP);
639             fSets.add(fB2);
640             fSets.add(fBA);
641             fSets.add(fBB);
642             fSets.add(fHY);
643             fSets.add(fH2);
644             fSets.add(fH3);
645             fSets.add(fCL);
646             fSets.add(fCP);
647             fSets.add(fEX);
648             fSets.add(fIN);
649             fSets.add(fJL);
650             fSets.add(fJT);
651             fSets.add(fJV);
652             fSets.add(fNS);
653             fSets.add(fOP);
654             fSets.add(fQU);
655             fSets.add(fIS);
656             fSets.add(fNU);
657             fSets.add(fPO);
658             fSets.add(fPR);
659             fSets.add(fSY);
660             fSets.add(fAI);
661             fSets.add(fAL);
662             fSets.add(fHL);
663             fSets.add(fID);
664             fSets.add(fWJ);
665             fSets.add(fSA);
666             fSets.add(fSG);
667             fSets.add(fRI);
668         }
669         
670         void setText(StringBuffer s) {
671             fText       = s;
672         }
673         
674         
675         
676
677         int next(int startPos) {
678             int    pos;       //  Index of the char following a potential break position
679             int    thisChar;  //  Character at above position "pos"
680             
681             int    prevPos;   //  Index of the char preceding a potential break position
682             int    prevChar;  //  Character at above position.  Note that prevChar
683                               //   and thisChar may not be adjacent because combining
684                               //   characters between them will be ignored.
685             int    prevCharX2; //  Character before prevChar, more contex for LB 21a
686             
687             int    nextPos;   //  Index of the next character following pos.
688                               //     Usually skips over combining marks.
689             int    tPos;      //  temp value.
690             int    matchVals[]  = null;       // Number  Expression Match Results
691  
692             
693             if (startPos >= fText.length()) {
694                 return -1;
695             }
696             
697             
698             // Initial values for loop.  Loop will run the first time without finding breaks,
699             //                           while the invalid values shift out and the "this" and
700             //                           "prev" positions are filled in with good values.
701             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
702             thisChar = prevChar  = prevCharX2 = 0;
703             nextPos  = startPos;
704             
705             
706             // Loop runs once per position in the test text, until a break position
707             //  is found.  In each iteration, we are testing for a possible break
708             //  just preceding the character at index "pos".  The character preceding
709             //  this char is at postion "prevPos"; because of combining sequences,
710             //  "prevPos" can be arbitrarily far before "pos".
711             for (;;) {
712                 // Advance to the next position to be tested.
713                 prevCharX2 = prevChar;
714                 prevPos   = pos;
715                 prevChar  = thisChar;
716                 pos       = nextPos;
717                 nextPos   = moveIndex32(fText, pos, 1);
718                 
719                 // Rule LB2 - Break at end of text.
720                 if (pos >= fText.length()) {
721                     break;
722                 }
723                 
724                 // Rule LB 9 - adjust for combining sequences.
725                 //             We do this rule out-of-order because the adjustment does
726                 //             not effect the way that rules LB 3 through LB 6 match,
727                 //             and doing it here rather than after LB 6 is substantially
728                 //             simpler when combining sequences do occur.
729                 
730                 
731                 // LB 9         Keep combining sequences together.
732                 //              advance over any CM class chars at "pos", 
733                 //              result is "nextPos" for the following loop iteration.
734                 thisChar  = UTF16.charAt(fText, pos);
735                 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
736                         thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
737                     for (;;) {
738                         if (nextPos == fText.length()) {
739                             break;   
740                         }
741                         int nextChar = UTF16.charAt(fText, nextPos);
742                         if (!fCM.contains(nextChar)) {
743                             break;
744                         }
745                         nextPos = moveIndex32(fText, nextPos, 1);
746                     }
747                 }
748                 
749                 // LB 9 Treat X CM* as if it were X
750                 //        No explicit action required.
751                 
752                 // LB 10     Treat any remaining combining mark as AL
753                 if (fCM.contains(thisChar)) {
754                     thisChar = 'A';   
755                 }
756
757                 
758                 // If the loop is still warming up - if we haven't shifted the initial
759                 //   -1 positions out of prevPos yet - loop back to advance the
760                 //    position in the input without any further looking for breaks.
761                 if (prevPos == -1) {
762                     continue;
763                 }
764                 
765                 // LB 4  Always break after hard line breaks,
766                 if (fBK.contains(prevChar)) {
767                     break;
768                 }
769                 
770                 // LB 5  Break after CR, LF, NL, but not inside CR LF
771                 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
772                     continue;
773                 }
774                 if  (fCR.contains(prevChar) ||
775                      fLF.contains(prevChar) ||
776                      fNL.contains(prevChar))  {
777                     break;
778                 }
779                 
780                 // LB 6  Don't break before hard line breaks
781                 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
782                         fLF.contains(thisChar) || fNL.contains(thisChar) ) {
783                     continue;
784                 }
785                 
786                 
787                 // LB 7  Don't break before spaces or zero-width space.
788                 if (fSP.contains(thisChar)) {
789                     continue;
790                 }
791                 
792                 if (fZW.contains(thisChar)) {
793                     continue;
794                 }
795                 
796                 // LB 8  Break after zero width space
797                 if (fZW.contains(prevChar)) {
798                     break;
799                 }
800                 
801                 //  LB 9, 10  Already done, at top of loop.
802                 //
803                 
804                 
805                 // LB 11
806                 //    x  WJ
807                 //    WJ  x
808                 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
809                     continue;
810                 }
811                 
812                 
813                 // LB 12
814                 //        GL x
815                 if (fGL.contains(prevChar)) {
816                     continue;
817                 }
818                 
819                 // LB 12a
820                 //    [^SP BA HY] x GL
821                 if (!(fSP.contains(prevChar) ||
822                       fBA.contains(prevChar) ||
823                       fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
824                     continue;
825                 }
826
827                 
828                 
829                 // LB 13  Don't break before closings.
830                 //       NU x CL, NU x CP  and NU x IS are not matched here so that they will
831                 //       fall into LB 17 and the more general number regular expression.
832                 //
833                 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
834                     !fNU.contains(prevChar) && fCP.contains(thisChar) ||
835                                                fEX.contains(thisChar) ||
836                     !fNU.contains(prevChar) && fIS.contains(thisChar) ||
837                     !fNU.contains(prevChar) && fSY.contains(thisChar))    {
838                     continue;
839                 }
840                 
841                 // LB 14  Don't break after OP SP*
842                 //       Scan backwards, checking for this sequence.
843                 //       The OP char could include combining marks, so we actually check for
844                 //           OP CM* SP* x
845                 tPos = prevPos;
846                 if (fSP.contains(prevChar)) {
847                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
848                         tPos=moveIndex32(fText, tPos, -1);
849                     }
850                 }
851                 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
852                     tPos=moveIndex32(fText, tPos, -1);
853                 }
854                 if (fOP.contains(UTF16.charAt(fText, tPos))) {
855                     continue;
856                 }
857                 
858                 // LB 15 Do not break within "[ 
859                 //       QU CM* SP* x OP
860                 if (fOP.contains(thisChar)) {
861                     // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
862                     tPos = prevPos;
863                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
864                         tPos = moveIndex32(fText, tPos, -1);
865                     }
866                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
867                         tPos = moveIndex32(fText, tPos, -1);
868                     }
869                     if (fQU.contains(UTF16.charAt(fText, tPos))) {
870                         continue;
871                     }
872                 }               
873                 
874                 // LB 16   (CL | CP) SP* x NS
875                 if (fNS.contains(thisChar)) {
876                     tPos = prevPos;
877                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
878                         tPos = moveIndex32(fText, tPos, -1);
879                     }
880                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
881                         tPos = moveIndex32(fText, tPos, -1);
882                     }
883                     if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
884                         continue;
885                     }
886                 }               
887                 
888                                
889                 // LB 17        B2 SP* x B2
890                 if (fB2.contains(thisChar)) {
891                     tPos = prevPos;
892                     while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
893                         tPos = moveIndex32(fText, tPos, -1);
894                     }
895                     while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
896                         tPos = moveIndex32(fText, tPos, -1);
897                     }
898                     if (fB2.contains(UTF16.charAt(fText, tPos))) {
899                         continue;
900                     }
901                 }               
902                 
903                 // LB 18    break after space
904                 if (fSP.contains(prevChar)) {
905                     break;
906                 }
907                 
908                 // LB 19
909                 //    x   QU
910                 //    QU  x
911                 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
912                     continue;
913                 }
914                 
915                 // LB 20  Break around a CB
916                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
917                     break;
918                 }
919                 
920                 // LB 21
921                 if (fBA.contains(thisChar) ||
922                         fHY.contains(thisChar) ||
923                         fNS.contains(thisChar) ||
924                         fBB.contains(prevChar) )   {
925                     continue;
926                 }
927                 
928                  // LB 21a, HL (HY | BA) x
929                 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
930                     continue;
931                 }
932
933                  // LB 21b, SY x HL
934                 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
935                     continue;
936                 }
937                 
938                // LB 22
939                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
940                         fHL.contains(prevChar) && fIN.contains(thisChar) ||
941                         fID.contains(prevChar) && fIN.contains(thisChar) ||
942                         fIN.contains(prevChar) && fIN.contains(thisChar) ||
943                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {
944                     continue;
945                 }
946                 
947                 
948                 // LB 23    ID x PO    (Note:  Leading CM behaves like ID)
949                 //          AL x NU
950                 //          NU x AL
951                 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
952                         fAL.contains(prevChar) && fNU.contains(thisChar) ||
953                         fHL.contains(prevChar) && fNU.contains(thisChar) ||
954                         fNU.contains(prevChar) && fAL.contains(thisChar) ||
955                         fNU.contains(prevChar) && fHL.contains(thisChar) )   {
956                    continue;
957                 }
958                 
959                 // LB 24  Do not break between prefix and letters or ideographs.
960                 //        PR x ID
961                 //        PR x AL
962                 //        PO x AL
963                 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
964                     fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
965                     fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)))  {
966                     continue;
967                 }
968                 
969                 
970                 // LB 25    Numbers
971                 matchVals = LBNumberCheck(fText, prevPos, matchVals);
972                 if (matchVals[0] != -1) {
973                     // Matched a number.  But could have been just a single digit, which would
974                     //    not represent a "no break here" between prevChar and thisChar
975                     int numEndIdx = matchVals[1];  // idx of first char following num
976                     if (numEndIdx > pos) {
977                         // Number match includes at least the two chars being checked
978                         if (numEndIdx > nextPos) {
979                             // Number match includes additional chars.  Update pos and nextPos
980                             //   so that next loop iteration will continue at the end of the number,
981                             //   checking for breaks between last char in number & whatever follows.
982                             nextPos = numEndIdx;
983                             pos     = numEndIdx;
984                             do {
985                                 pos = moveIndex32(fText, pos, -1);  
986                                 thisChar = UTF16.charAt(fText, pos);
987                             }
988                             while (fCM.contains(thisChar));
989                         }
990                         continue;
991                     }
992                 }
993                 
994                 
995                 // LB 26  Do not break Korean Syllables
996                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
997                                                 fJV.contains(thisChar) ||
998                                                 fH2.contains(thisChar) ||
999                                                 fH3.contains(thisChar))) {
1000                                                     continue;
1001                                                 }
1002
1003                 if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
1004                     (fJV.contains(thisChar) || fJT.contains(thisChar))) {
1005                         continue;
1006                 }
1007
1008                 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
1009                     fJT.contains(thisChar)) {
1010                         continue;
1011                 }
1012
1013                 // LB 27 Treat a Korean Syllable Block the same as ID
1014                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1015                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1016                     fIN.contains(thisChar)) {
1017                         continue;
1018                     }
1019                 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1020                     fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1021                     fPO.contains(thisChar)) {
1022                         continue;
1023                     }
1024                 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
1025                     fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
1026                         continue;
1027                     }
1028
1029                 
1030                 
1031                 // LB 28 Do not break between alphabetics
1032                 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1033                     continue;
1034                 }
1035                 
1036                 // LB 29  Do not break between numeric punctuation and alphabetics
1037                 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1038                     continue;
1039                 }
1040                 
1041                 // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
1042                 //          (AL | NU) x OP
1043                 //          CP x (AL | NU)
1044                 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
1045                     continue;
1046                 }
1047                 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
1048                     continue;
1049                 }
1050
1051                 // LB 30a   Do not break between regional indicators.  RI × RI
1052                 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
1053                     continue;
1054                 }
1055                 
1056                 // LB 31    Break everywhere else
1057                 break;            
1058             }
1059             
1060             return pos;
1061         }
1062         
1063         
1064         
1065         // LBNumberCheck()  Match the following regular expression in the input text, anchored at startIdx.
1066         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
1067         //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
1068         //  retVals array  [0]  index of the start of the match (startIdx), or -1 if no match
1069         //                 [1]  index of first char following the match; not written when there is no match.
1070         //  Pass null for retVals to have a new int[2] allocated; the array used is also the return value.
1071         //  Java regex cannot be used because supplementary character support is needed, and because the
1072         //     Unicode char properties version must be the same as in the version of ICU being tested.
1073         private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1074             if (retVals == null) {
1075                 retVals = new int[2];
1076              }
1077             retVals[0]     = -1;  // Indicates no match.
1078             int matchState = 0;
1079             int idx        = startIdx;
1080             // One code point per pass: a plain "break" consumes it and loops on, "break matchLoop" ends the scan at idx.
1081             matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1082                 int c = UTF16.charAt(s, idx);
1083                 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);   // Line_Break class of c
1084                 switch (matchState) {
1085                     case 0:   // Start state: nothing consumed yet.
1086                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1087                             cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1088                             matchState = 1;  
1089                             break;
1090                         }
1091                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1092                             matchState = 4;
1093                             break;
1094                         }
1095                         if (cLBType == UCharacter.LineBreak.HYPHEN) {
1096                             matchState = 4;
1097                             break;
1098                         }
1099                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1100                             matchState = 7;
1101                             break;
1102                         }
1103                         break matchLoop;   /* No Match  */
1104                         
1105                     case 1:   // After a leading (PR | PO) CM*
1106                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1107                             matchState = 1;
1108                             break;
1109                         }
1110                         if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1111                             matchState = 4;
1112                             break;
1113                         }
1114                         if (cLBType == UCharacter.LineBreak.HYPHEN) {
1115                             matchState = 4;
1116                             break;
1117                         }
1118                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1119                             matchState = 7;
1120                             break;
1121                         }
1122                         break matchLoop;   /* No Match  */
1123                         
1124                         
1125                     case 4:   // After (OP | HY) CM*; entered directly from states 0 and 1 (the code has no state 3).
1126                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1127                             matchState = 4;
1128                             break;
1129                         }
1130                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1131                             matchState = 7;
1132                             break;
1133                         }
1134                         break matchLoop;   /* No Match  */
1135                         //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
1136                         //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
1137                  
1138                     case 7:   // At least one NU consumed; inside  NU CM* ((NU | IS | SY) CM*)*
1139                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1140                             matchState = 7;
1141                             break;                           
1142                         }
1143                         if (cLBType == UCharacter.LineBreak.NUMERIC) {
1144                             matchState = 7;
1145                             break;                           
1146                         }
1147                         if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1148                             matchState = 7;
1149                             break;                           
1150                         }
1151                         if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1152                             matchState = 7;
1153                             break;       
1154                         }
1155                         if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1156                             matchState = 9;
1157                             break;                           
1158                         }
1159                         if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1160                             matchState = 9;
1161                             break;                           
1162                         }
1163                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1164                             matchState = 11;
1165                             break;                           
1166                         }
1167                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1168                             matchState = 11;
1169                             break;                           
1170                         }
1171
1172                         break matchLoop;    // Match Complete.
1173                     case 9:   // After (CL | CP) CM*
1174                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1175                             matchState = 9;
1176                             break;                           
1177                         }
1178                         if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1179                             matchState = 11;
1180                             break;                           
1181                         }
1182                         if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1183                             matchState = 11;
1184                             break;                           
1185                         }
1186                         break matchLoop;    // Match Complete.
1187                     case 11:   // After the trailing (PR | PO) CM*; only further CM can extend the match.
1188                         if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1189                             matchState = 11;
1190                             break;                           
1191                         }
1192                         break matchLoop;    // Match Complete.
1193                 }
1194             }
1195             if (matchState > 4) {   // States 7, 9 and 11 are only reachable once the mandatory NU was seen.
1196                 retVals[0] = startIdx;   
1197                  retVals[1] = idx;   // idx stopped on the first code point that was not consumed (or at end of text).
1198             }
1199             return retVals;
1200         }
1201         
1202         // charClasses()  Returns fSets, the list this monkey exposes as its character classes.
1203         List  charClasses() {
1204             return fSets;
1205         }
1206         
1207         
1208     
1209     }
1210
1211      
1212     /**
1213      * 
1214      * Sentence Monkey Test Class
1215      *
1216      * 
1217      * 
1218      */
1219     static class RBBISentenceMonkey extends RBBIMonkeyKind {
1220         List                 fSets;          // All sets below, in the order the constructor adds them.
1221         StringBuffer         fText;          // Text under test; assigned by setText().
1222         // One UnicodeSet per Sentence_Break property value, built in the constructor.
1223         UnicodeSet           fSepSet;        // Sentence_Break = Sep, plus U+000A and U+000D
1224         UnicodeSet           fFormatSet;     // Sentence_Break = Format
1225         UnicodeSet           fSpSet;         // Sentence_Break = Sp
1226         UnicodeSet           fLowerSet;      // Sentence_Break = Lower
1227         UnicodeSet           fUpperSet;      // Sentence_Break = Upper
1228         UnicodeSet           fOLetterSet;    // Sentence_Break = OLetter
1229         UnicodeSet           fNumericSet;    // Sentence_Break = Numeric
1230         UnicodeSet           fATermSet;      // Sentence_Break = ATerm
1231         UnicodeSet           fSContinueSet;  // Sentence_Break = SContinue
1232         UnicodeSet           fSTermSet;      // Sentence_Break = STerm
1233         UnicodeSet           fCloseSet;      // Sentence_Break = Close
1234         UnicodeSet           fOtherSet;      // Every code point that is in none of the other sets
1235         UnicodeSet           fExtendSet;     // Sentence_Break = Extend
1236
1237         // Constructor.  Builds one UnicodeSet per Sentence_Break property value, derives fOtherSet
1238         //   as everything not covered by those sets, and adds all thirteen sets to fSets.
1239         RBBISentenceMonkey() {
1240             fCharProperty  = UProperty.SENTENCE_BREAK;
1241
1242             fSets            = new ArrayList();
1243
1244             //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
1245             //                       set and made into character classes of their own.  For the monkey impl,
1246             //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
1247             fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1248             fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1249             fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1250             fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1251             fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1252             fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1253             fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1254             fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1255             fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1256             fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1257             fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1258             fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1259             fOtherSet        = new UnicodeSet();
1260
1261             // "Other" = complement of the empty set (all code points) minus every set built above.
1262             fOtherSet.complement();
1263             fOtherSet.removeAll(fSepSet);
1264             fOtherSet.removeAll(fFormatSet);
1265             fOtherSet.removeAll(fSpSet);
1266             fOtherSet.removeAll(fLowerSet);
1267             fOtherSet.removeAll(fUpperSet);
1268             fOtherSet.removeAll(fOLetterSet);
1269             fOtherSet.removeAll(fNumericSet);
1270             fOtherSet.removeAll(fATermSet);
1271             fOtherSet.removeAll(fSContinueSet);
1272             fOtherSet.removeAll(fSTermSet);
1273             fOtherSet.removeAll(fCloseSet);
1274             fOtherSet.removeAll(fExtendSet);
1275             // Register the sets; charClasses() hands this list back to callers.
1276             fSets.add(fSepSet);
1277             fSets.add(fFormatSet);
1278
1279             fSets.add(fSpSet);
1280             fSets.add(fLowerSet);
1281             fSets.add(fUpperSet);
1282             fSets.add(fOLetterSet);
1283             fSets.add(fNumericSet);
1284             fSets.add(fATermSet);
1285             fSets.add(fSContinueSet);
1286             fSets.add(fSTermSet);
1287             fSets.add(fCloseSet);
1288             fSets.add(fOtherSet);
1289             fSets.add(fExtendSet);
1290         }
1291         
1292         // charClasses()  Returns fSets, the list of character-class sets filled in by the constructor.
1293         List  charClasses() {
1294             return fSets;  
1295         }
1296         // setText()  Installs the text to be tested.  The buffer is kept by reference, not copied.
1297         void   setText(StringBuffer s) { 
1298             fText = s;        
1299         }   
1300
1301         
1302         //      moveBack()   Find the "significant" code point preceding the index i.
1303         //      Skips backwards over ($Extend | $Format)*.  Returns the index reached, or -1 when i <= 0.
1304         //      The scan also stops once it reaches index 0, whatever code point is found there.
1305         private int moveBack(int i) {
1306             
1307             if (i <= 0) {
1308                 return -1;
1309             }
1310             
1311             int      c;
1312             int      j = i;
1313             do {
1314                 j = moveIndex32(fText, j, -1);   // moveIndex32 is defined elsewhere; presumably steps one code point - confirm
1315                 c = UTF16.charAt(fText, j);
1316             }
1317             while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1318             return j;
1319         }
1320         // moveForward()  Advance from index i, then keep advancing while the code point reached is
1321         //                Extend or Format.  Returns fText.length() when i is already at or past the end.
1322         int moveForward(int i) {
1323             if (i>=fText.length()) {
1324                 return fText.length();
1325             }
1326             int   c;
1327             int   j = i;
1328             do {
1329                 j = moveIndex32(fText, j, 1);
1330                 c = cAt(j);   // -1 once j is outside the text, which ends the loop
1331             }
1332             while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1333             return j;
1334            
1335         }
1336         // cAt()  Code point of fText at index pos, or -1 when pos is negative or not less than the text length.
1337         int cAt(int pos) {
1338             if (pos<0 || pos>=fText.length()) {
1339                 return -1;
1340             }
1341             return UTF16.charAt(fText, pos);
1342         }
1343
1344         int   next(int prevPos) {  
1345             int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the 
1346                                         //   break position being tested.  The candidate break
1347                                         //   location is before p2.
1348             int     breakPos = -1;
1349             
1350             int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
1351             int c;
1352             
1353             // Prev break at end of string.  return DONE.
1354             if (prevPos >= fText.length()) {
1355                 return -1;
1356             }
1357             /*p0 =*/ p1 = p2 = p3 = prevPos;
1358             c3 = UTF16.charAt(fText, prevPos);
1359             c0 = c1 = c2 = 0;
1360             
1361             // Loop runs once per "significant" character position in the input text.
1362             for (;;) {
1363                 // Move all of the positions forward in the input string.
1364                 /*p0 = p1;*/  c0 = c1;
1365                 p1 = p2;  c1 = c2;
1366                 p2 = p3;  c2 = c3;
1367                 
1368                 // Advance p3 by  X(Extend | Format)*   Rule 4
1369                 p3 = moveForward(p3);
1370                 c3 = cAt(p3);
1371                 
1372                 // Rule (3) CR x LF
1373                 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1374                     continue;
1375                 }
1376                 
1377                 // Rule (4)    Sep  <break>
1378                 if (fSepSet.contains(c1)) {
1379                     p2 = p1+1;   // Separators don't combine with Extend or Format
1380                     break;
1381                 }               
1382
1383                 if (p2 >= fText.length()) {
1384                     // Reached end of string.  Always a break position.
1385                     break;
1386                 }
1387
1388                 if (p2 == prevPos) {
1389                     // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1390                     continue;
1391                 }
1392
1393                 // Rule (6).   ATerm x Numeric
1394                 if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
1395                     continue;
1396                 }
1397
1398                 // Rule (7).  Upper ATerm  x  Upper
1399                 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1400                     continue;
1401                 }
1402
1403                 // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep))* Lower
1404                 //           Note:  Sterm | ATerm are added to the negated part of the expression by a 
1405                 //                  note to the Unicode 5.0 documents.
1406                 int p8 = p1;
1407                 while (p8>0 && fSpSet.contains(cAt(p8))) {
1408                     p8 = moveBack(p8);
1409                 }
1410                 while (p8>0 && fCloseSet.contains(cAt(p8))) {
1411                     p8 = moveBack(p8);
1412                 }
1413                 if (fATermSet.contains(cAt(p8))) {
1414                     p8=p2;
1415                     for (;;) {
1416                         c = cAt(p8);
1417                         if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1418                             fLowerSet.contains(c) || fSepSet.contains(c) ||
1419                             fATermSet.contains(c) || fSTermSet.contains(c))  
1420                          {
1421                             break;
1422                         }
1423                         p8 = moveForward(p8);
1424                     }
1425                     if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1426                         continue;
1427                     }
1428                 }
1429                 
1430                 // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
1431                 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1432                     p8 = p1;
1433                     while (setContains(fSpSet, cAt(p8))) {
1434                         p8 = moveBack(p8);
1435                     }
1436                     while (setContains(fCloseSet, cAt(p8))) {
1437                         p8 = moveBack(p8);
1438                     }
1439                     c = cAt(p8);
1440                     if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1441                         continue;
1442                     }
1443                 }
1444
1445
1446                 // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
1447                 int p9 = p1;
1448                 while (p9>0 && fCloseSet.contains(cAt(p9))) {
1449                     p9 = moveBack(p9);
1450                 }
1451                 c = cAt(p9);
1452                 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1453                     if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1454                         continue;
1455                     }
1456                 }
1457
1458                 // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
1459                 int p10 = p1;
1460                 while (p10>0 && fSpSet.contains(cAt(p10))) {
1461                     p10 = moveBack(p10);
1462                 }
1463                 while (p10>0 && fCloseSet.contains(cAt(p10))) {
1464                     p10 = moveBack(p10);
1465                 }
1466                 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1467                     if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1468                         continue;
1469                     }
1470                 }
1471
1472                 // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
1473                 int p11 = p1;
1474                 if (p11>0 && fSepSet.contains(cAt(p11))) {
1475                     p11 = moveBack(p11);
1476                 }
1477                 while (p11>0 && fSpSet.contains(cAt(p11))) {
1478                     p11 = moveBack(p11);
1479                 }
1480                 while (p11>0 && fCloseSet.contains(cAt(p11))) {
1481                     p11 = moveBack(p11);
1482                 }
1483                 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1484                     break;
1485                 }
1486
1487                 //  Rule (12)  Any x Any
1488                 continue;
1489             }
1490             breakPos = p2;
1491             return breakPos;
1492         }
1493            
1494
1495         
1496     }
1497
1498  
1499     /**
1500      * Move an index into a string by n code points.
1501      *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1502      *   complicating usage.
1503      * @param s   a Text string
1504      * @param pos The starting code unit index into the text string
1505      * @param amt The amount to adjust the string by.
1506      * @return    The adjusted code unit index, pinned to the string's length, or
1507      *            unchanged if input index was outside of the string.
1508      */
1509     static int moveIndex32(StringBuffer s, int pos, int amt) {
1510         int i;
1511         char  c;
1512         if (amt>0) {
1513             for (i=0; i<amt; i++) {
1514                 if (pos >= s.length()) {
1515                     return s.length();                   
1516                 }
1517                 c = s.charAt(pos);
1518                 pos++;
1519                 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1520                     c = s.charAt(pos);
1521                     if (UTF16.isTrailSurrogate(c)) {
1522                         pos++;   
1523                     }
1524                 }
1525             }
1526         } else {
1527             for (i=0; i>amt; i--) {
1528                 if (pos <= 0) {
1529                     return 0;   
1530                 }
1531                 pos--;
1532                 c = s.charAt(pos);
1533                 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1534                     c = s.charAt(pos);
1535                     if (UTF16.isLeadSurrogate(c)) {
1536                         pos--;   
1537                     }
1538                 }
1539             }
1540         }
1541         return pos;
1542     }
1543     
1544     /**
1545      * No-exceptions form of UnicodeSet.contains(c).
1546      *    Simplifies loops that terminate with an end-of-input character value.
1547      * @param s  A unicode set
1548      * @param c  A code point value
1549      * @return   true if the set contains c.
1550      */
1551     static boolean setContains(UnicodeSet s, int c) {
1552         if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1553             return false;
1554         }
1555         return s.contains(c);
1556     }
1557     
1558     
1559     /**
1560      * return the index of the next code point in the input text.
1561      * @param i the preceding index
1562      */
1563     static int  nextCP(StringBuffer s, int i) {
1564         if (i == -1) {
1565             // End of Input indication.  Continue to return end value.
1566             return -1;
1567         }
1568         int  retVal = i + 1;
1569         if (retVal > s.length()) {
1570             return -1;
1571         }
1572         int  c = UTF16.charAt(s, i);
1573         if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1574             retVal++;
1575         }
1576         return retVal;
1577     }
1578     
1579     
1580     /**
1581      * random number generator.  Not using Java's built-in Randoms for two reasons:
1582      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1583      *    2.  We need to get and restore the seed from values occurring in the middle
1584      *        of a long sequence, to more easily reproduce failing cases.
1585      */
1586     private static int m_seed = 1;
1587     private static int  m_rand()
1588     {
1589         m_seed = m_seed * 1103515245 + 12345;
1590         return (int)(m_seed >>> 16) % 32768;
1591     }
1592
1593     // Helper function for formatting error output.
1594     //   Append a string into a fixed-size field in a StringBuffer.
1595     //   Blank-pad the string if it is shorter than the field.
1596     //   Truncate the source string if it is too long.
1597     //
1598     private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1599         int appendLen = src.length();
1600         if (appendLen >= fieldLen) {
1601             dest.append(src.substring(0, fieldLen));
1602         } else {
1603             dest.append(src);
1604             while (appendLen < fieldLen) {
1605                 dest.append(' ');
1606                 appendLen++;
1607             }
1608         }
1609     }
1610
1611     // Helper function for formatting error output.
1612     // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1613     private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1614            String hexChars = "0123456789abcdef";
1615            if (c < 0x10000) {
1616                 dest.append("\\u");
1617                 for (int bn=12; bn>=0; bn-=4) {
1618                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
1619                 }
1620                 appendToBuf(dest, " ", fieldLen-6);
1621             } else {
1622                 dest.append("\\U");
1623                 for (int bn=28; bn>=0; bn-=4) {
1624                     dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
1625                 }
1626                 appendToBuf(dest, " ", fieldLen-10);
1627
1628             }
1629        }
1630     
1631 /**
1632  *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1633  *    Parameters:
1634  *       bi      - the break iterator to use
1635  *       mk      - MonkeyKind, abstraction for obtaining expected results
1636  *       name    - Name of test (char, word, etc.) for use in error messages
1637  *       seed    - Seed for starting random number generator (parameter from user)
1638  *       numIterations
1639  */
1640 void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
1641     int              TESTSTRINGLEN = 500;
1642     StringBuffer     testText         = new StringBuffer();
1643     int              numCharClasses;
1644     List             chClasses;
1645     int[]            expected         = new int[TESTSTRINGLEN*2 + 1];
1646     int              expectedCount    = 0;
1647     boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
1648     boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1649     boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1650     boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1651     boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1652     boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1653     int              i;
1654     int              loopCount        = 0;
1655     boolean          printTestData    = false;
1656     boolean          printBreaksFromBI = false;
1657
1658     m_seed = seed;
1659
1660     numCharClasses = mk.charClasses().size();
1661     chClasses      = mk.charClasses();
1662
1663     // Verify that the character classes all have at least one member.
1664     for (i=0; i<numCharClasses; i++) {
1665         UnicodeSet s = (UnicodeSet)chClasses.get(i);
1666         if (s == null || s.size() == 0) {
1667             errln("Character Class " + i + " is null or of zero size.");
1668             return;
1669         }
1670     }
1671
1672     //--------------------------------------------------------------------------------------------
1673     //
1674     //  Debugging settings.  Comment out everything in the following block for normal operation
1675     //
1676     //--------------------------------------------------------------------------------------------
1677     // numIterations = -1;  
1678     // RuleBasedBreakIterator_New.fTrace = true;
1679     // m_seed = 859056465;
1680     // TESTSTRINGLEN = 50;
1681     // printTestData = true;
1682     // printBreaksFromBI = true;
1683     // ((RuleBasedBreakIterator_New)bi).dump();
1684     
1685     //--------------------------------------------------------------------------------------------
1686     //
1687     //  End of Debugging settings.  
1688     //
1689     //--------------------------------------------------------------------------------------------
1690     
1691     int  dotsOnLine = 0;
1692      while (loopCount < numIterations || numIterations == -1) {
1693         if (numIterations == -1 && loopCount % 10 == 0) {
1694             // If test is running in an infinite loop, display a periodic tic so
1695             //   we can tell that it is making progress.
1696             System.out.print(".");
1697             if (dotsOnLine++ >= 80){
1698                 System.out.println();
1699                 dotsOnLine = 0;
1700             }
1701         }
1702         // Save current random number seed, so that we can recreate the random numbers
1703         //   for this loop iteration in event of an error.
1704         seed = m_seed;
1705
1706         testText.setLength(0);
1707         // Populate a test string with data.
1708         if (printTestData) {
1709             System.out.println("Test Data string ..."); 
1710         }
1711         for (i=0; i<TESTSTRINGLEN; i++) {
1712             int        aClassNum = m_rand() % numCharClasses;
1713             UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
1714             int        charIdx   = m_rand() % classSet.size();
1715             int        c         = classSet.charAt(charIdx);
1716             if (c < 0) {   // TODO:  deal with sets containing strings.
1717                 errln("c < 0");
1718             }
1719             UTF16.appendCodePoint(testText, c);
1720             if (printTestData) {
1721                 System.out.print(Integer.toHexString(c) + " ");
1722             }
1723         }
1724         if (printTestData) {
1725             System.out.println(); 
1726         }
1727
1728         Arrays.fill(expected, 0);
1729         Arrays.fill(expectedBreaks, false);
1730         Arrays.fill(forwardBreaks, false);
1731         Arrays.fill(reverseBreaks, false);
1732         Arrays.fill(isBoundaryBreaks, false);
1733         Arrays.fill(followingBreaks, false);
1734         Arrays.fill(precedingBreaks, false);
1735  
1736         // Calculate the expected results for this test string.
1737         mk.setText(testText);
1738         expectedCount = 0;
1739         expectedBreaks[0] = true;
1740         expected[expectedCount ++] = 0;
1741         int breakPos = 0;
1742         int lastBreakPos = -1;
1743         for (;;) {
1744             lastBreakPos = breakPos;
1745             breakPos = mk.next(breakPos);
1746             if (breakPos == -1) {
1747                 break;
1748             }
1749             if (breakPos > testText.length()) {
1750                 errln("breakPos > testText.length()");
1751             }
1752             if (lastBreakPos >= breakPos) {
1753                 errln("Next() not increasing.");
1754                 // break;
1755             }
1756             expectedBreaks[breakPos] = true;
1757             expected[expectedCount ++] = breakPos;
1758         }
1759
1760         // Find the break positions using forward iteration
1761         if (printBreaksFromBI) {
1762             System.out.println("Breaks from BI...");  
1763         }
1764         bi.setText(testText.toString());
1765         for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
1766             if (i < 0 || i > testText.length()) {
1767                 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
1768                 break;
1769             }
1770             if (printBreaksFromBI) {
1771                 System.out.print(Integer.toHexString(i) + " ");
1772             }
1773             forwardBreaks[i] = true;
1774         }
1775         if (printBreaksFromBI) {
1776             System.out.println();
1777         }
1778
1779         // Find the break positions using reverse iteration
1780         for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
1781             if (i < 0 || i > testText.length()) {
1782                 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
1783                 break;
1784             }
1785             reverseBreaks[i] = true;
1786         }
1787
1788         // Find the break positions using isBoundary() tests.
1789         for (i=0; i<=testText.length(); i++) {
1790             isBoundaryBreaks[i] = bi.isBoundary(i);
1791         }
1792
1793         // Find the break positions using the following() function.
1794         lastBreakPos = 0;
1795         followingBreaks[0] = true;
1796         for (i=0; i<testText.length(); i++) {
1797             breakPos = bi.following(i);
1798             if (breakPos <= i ||
1799                 breakPos < lastBreakPos ||
1800                 breakPos > testText.length() ||
1801                 breakPos > lastBreakPos && lastBreakPos > i ) {
1802                 errln(name + " break monkey test: " +
1803                     "Out of range value returned by BreakIterator::following().\n" +
1804                     "index=" + i + "following returned=" + breakPos +
1805                     "lastBreak=" + lastBreakPos);
1806                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
1807             } else {
1808                 followingBreaks[breakPos] = true;
1809                 lastBreakPos = breakPos;
1810             }
1811         }
1812         
1813         // Find the break positions using the preceding() function.
1814         lastBreakPos = testText.length();
1815         precedingBreaks[testText.length()] = true;
1816         for (i=testText.length(); i>0; i--) {
1817             breakPos = bi.preceding(i);
1818             if (breakPos >= i ||
1819                 breakPos > lastBreakPos ||
1820                 breakPos < 0 ||
1821                 breakPos < lastBreakPos && lastBreakPos < i ) {
1822                 errln(name + " break monkey test: " +
1823                         "Out of range value returned by BreakIterator::preceding().\n" +
1824                         "index=" + i + "preceding returned=" + breakPos +
1825                         "lastBreak=" + lastBreakPos);
1826                 precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
1827             } else {
1828                 precedingBreaks[breakPos] = true;
1829                 lastBreakPos = breakPos;
1830             }
1831         }
1832
1833         
1834
1835         // Compare the expected and actual results.
1836         for (i=0; i<=testText.length(); i++) {
1837             String errorType = null;
1838             if  (forwardBreaks[i] != expectedBreaks[i]) {
1839                 errorType = "next()";
1840             } else if (reverseBreaks[i] != forwardBreaks[i]) {
1841                 errorType = "previous()";
1842             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
1843                 errorType = "isBoundary()";
1844             } else if (followingBreaks[i] != expectedBreaks[i]) {
1845                 errorType = "following()";
1846             } else if (precedingBreaks[i] != expectedBreaks[i]) {
1847                 errorType = "preceding()";
1848             }
1849
1850
1851             if (errorType != null) {
1852                 // Format a range of the test text that includes the failure as
1853                 //  a data item that can be included in the rbbi test data file.
1854
1855                 // Start of the range is the last point where expected and actual results
1856                 //   both agreed that there was a break position.
1857                 int startContext = i;
1858                 int count = 0;
1859                 for (;;) {
1860                     if (startContext==0) { break; }
1861                     startContext --;
1862                     if (expectedBreaks[startContext]) {
1863                         if (count == 2) break;
1864                         count ++;
1865                     }
1866                 }
1867
1868                 // End of range is two expected breaks past the start position.
1869                 int endContext = i + 1;
1870                 int ci;
1871                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
1872                     for (;;) {
1873                         if (endContext >= testText.length()) {break;}
1874                         if (expectedBreaks[endContext-1]) { 
1875                             if (count == 0) break;
1876                             count --;
1877                         }
1878                         endContext ++;
1879                     }
1880                 }
1881
1882                 // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
1883                 StringBuffer errorText = new StringBuffer();
1884
1885                 int      c;    // Char from test data
1886                 for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
1887                     if (ci == i) {
1888                         // This is the location of the error.
1889                         errorText.append("<?>---------------------------------\n");
1890                     } else if (expectedBreaks[ci]) {
1891                         // This a non-error expected break position.
1892                         errorText.append("------------------------------------\n");
1893                     }
1894                     if (ci < testText.length()) {
1895                         c = UTF16.charAt(testText, ci);
1896                         appendCharToBuf(errorText, c, 11);
1897                         String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
1898                         appendToBuf(errorText, gc, 8);
1899                         int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
1900                         String extraPropValue = 
1901                             UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
1902                         appendToBuf(errorText, extraPropValue, 20);
1903
1904                         String charName = UCharacter.getExtendedName(c);
1905                         appendToBuf(errorText, charName, 40);
1906                         errorText.append('\n');
1907                     }
1908                 }
1909                 if (ci == testText.length() && ci != -1) {
1910                     errorText.append("<>");
1911                 }
1912                 errorText.append("</data>\n");
1913
1914                 // Output the error
1915                 errln(name + " break monkey test error.  " + 
1916                      (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
1917                       "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
1918                       errorText);
1919                 break;
1920             }
1921         }
1922
1923         loopCount++;
1924     }
1925 }
1926
1927 public void TestCharMonkey() {
1928     
1929     int        loopCount = 500;
1930     int        seed      = 1;
1931     
1932     if (params.inclusion >= 9) {
1933         loopCount = 10000;
1934     }
1935     
1936     RBBICharMonkey  m = new RBBICharMonkey();
1937     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
1938     RunMonkey(bi, m, "char", seed, loopCount);
1939 }
1940
1941 public void TestWordMonkey() {
1942     
1943     int        loopCount = 500;
1944     int        seed      = 1;
1945     
1946     if (params.inclusion >= 9) {
1947         loopCount = 10000;
1948     }
1949     
1950     logln("Word Break Monkey Test");
1951     RBBIWordMonkey  m = new RBBIWordMonkey();
1952     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
1953     RunMonkey(bi, m, "word", seed, loopCount);
1954 }
1955
1956 public void TestLineMonkey() {
1957     int        loopCount = 500;
1958     int        seed      = 1;
1959     
1960     if (params.inclusion >= 9) {
1961         loopCount = 10000;
1962     }
1963     
1964     logln("Line Break Monkey Test");
1965     RBBILineMonkey  m = new RBBILineMonkey();
1966     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
1967     if (params == null) {
1968         loopCount = 50;
1969     }
1970     RunMonkey(bi, m, "line", seed, loopCount);
1971 }
1972
1973 public void TestSentMonkey() {
1974     
1975     int        loopCount = 500;
1976     int        seed      = 1;
1977     
1978     if (params.inclusion >= 9) {
1979         loopCount = 3000;
1980     }
1981     
1982     logln("Sentence Break Monkey Test");
1983     RBBISentenceMonkey  m = new RBBISentenceMonkey();
1984     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
1985     if (params == null) {
1986         loopCount = 30;
1987     }
1988     RunMonkey(bi, m, "sent", seed, loopCount);
1989 }
1990 //
1991 //  Round-trip monkey tests.
1992 //  Verify that break iterators created from the rule source from the default
1993 //    break iterators still pass the monkey test for the iterator type.
1994 //
1995 //  This is a major test for the Rule Compiler.  The default break iterators are built
1996 //  from pre-compiled binary rule data that was created using ICU4C; these
1997 //  round-trip rule recompile tests verify that the Java rule compiler can
1998 //  rebuild break iterators from the original source rules.
1999 //
2000 public void TestRTCharMonkey() {
2001     
2002     int        loopCount = 200;
2003     int        seed      = 1;
2004     
2005     if (params.inclusion >= 9) {
2006         loopCount = 2000;
2007     }
2008     
2009     RBBICharMonkey  m = new RBBICharMonkey();
2010     BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2011     String rules = bi.toString();
2012     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2013     RunMonkey(rtbi, m, "char", seed, loopCount);
2014 }
2015
2016 public void TestRTWordMonkey() {
2017     
2018     int        loopCount = 200;
2019     int        seed      = 1;
2020     
2021     if (params.inclusion >= 9) {
2022         loopCount = 2000;
2023     }
2024     logln("Word Break Monkey Test");
2025     RBBIWordMonkey  m = new RBBIWordMonkey();
2026     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2027     String rules = bi.toString();
2028     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2029     RunMonkey(rtbi, m, "word", seed, loopCount);
2030 }
2031
2032 public void TestRTLineMonkey() {
2033     int        loopCount = 200;
2034     int        seed      = 1;
2035     
2036     if (params.inclusion >= 9) {
2037         loopCount = 2000;
2038     }
2039     
2040     logln("Line Break Monkey Test");
2041     RBBILineMonkey  m = new RBBILineMonkey();
2042     BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2043     String rules = bi.toString();
2044     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2045     if (params == null) {
2046         loopCount = 50;
2047     }
2048     RunMonkey(rtbi, m, "line", seed, loopCount);
2049 }
2050
2051 public void TestRTSentMonkey() {
2052     
2053     int        loopCount = 200;
2054     int        seed      = 1;
2055     
2056     if (params.inclusion >= 9) {
2057         loopCount = 1000;
2058     }
2059     
2060     logln("Sentence Break Monkey Test");
2061     RBBISentenceMonkey  m = new RBBISentenceMonkey();
2062     BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2063     String rules = bi.toString();
2064     BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2065     if (params == null) {
2066         loopCount = 30;
2067     }
2068     RunMonkey(rtbi, m, "sent", seed, loopCount);
2069 }
2070
2071
2072
2073 }
2074