2 *******************************************************************************
3 * Copyright (C) 2003-2010 International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.dev.test.rbbi;
10 // Monkey testing of RuleBasedBreakIterator
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.List;
14 import java.util.Locale;
16 import com.ibm.icu.dev.test.TestFmwk;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.lang.UProperty;
19 import com.ibm.icu.text.BreakIterator;
20 import com.ibm.icu.text.RuleBasedBreakIterator;
21 import com.ibm.icu.text.UTF16;
22 import com.ibm.icu.text.UnicodeSet;
26 * Monkey tests for RBBI. These tests have independent implementations of
27 * the Unicode TR boundary rules, and compare results between these and ICU's
28 * implementation, using random data.
30 * Tests cover Grapheme Cluster (char), Word and Line breaks
32 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
35 public class RBBITestMonkey extends TestFmwk {
37 public static void main(String[] args) {
38 new RBBITestMonkey().run(args);
42 // class RBBIMonkeyKind
44 // Monkey Test for Break Iteration
45 // Abstract interface class. Concrete derived classes independently
46 // implement the break rules for different iterator types.
48 // The Monkey Test itself doesn't know which type of break iterator it is
49 // testing, but works purely in terms of the interface defined here.
51 abstract static class RBBIMonkeyKind {
53 // Return a List of UnicodeSets, representing the character classes used
54 // for this type of iterator.
55 abstract List charClasses();
57 // Set the test text on which subsequent calls to next() will operate
58 abstract void setText(StringBuffer text);
60 // Find the next break position, starting from the specified position.
61 // Return -1 after reaching end of string.
62 abstract int next(int i);
64 // A Character Property, one of the constants defined in class UProperty.
65 // The value of this property will be displayed for the characters
66 // near any test failure.
72 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
74 static class RBBICharMonkey extends RBBIMonkeyKind {
78 UnicodeSet fControlSet;
79 UnicodeSet fExtendSet;
80 UnicodeSet fPrependSet;
81 UnicodeSet fSpacingSet;
87 UnicodeSet fHangulSet;
95 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
96 fCRLFSet = new UnicodeSet("[\\r\\n]");
97 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
98 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
99 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
100 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
101 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
102 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
103 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
104 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
105 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
106 fHangulSet = new UnicodeSet();
107 fHangulSet.addAll(fLSet);
108 fHangulSet.addAll(fVSet);
109 fHangulSet.addAll(fTSet);
110 fHangulSet.addAll(fLVSet);
111 fHangulSet.addAll(fLVTSet);
113 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
115 fSets = new ArrayList();
117 fSets.add(fControlSet);
118 fSets.add(fExtendSet);
119 fSets.add(fPrependSet);
120 fSets.add(fSpacingSet);
121 fSets.add(fHangulSet);
126 void setText(StringBuffer s) {
134 int next(int prevPos) {
135 int p1, p2, p3; // Indices of the significant code points around the
136 // break position being tested. The candidate break
137 // location is before p2.
141 int c1, c2, c3; // The code points at p0, p1, p2 & p3.
143 // Previous break at end of string. return DONE.
144 if (prevPos >= fText.length()) {
147 p1 = p2 = p3 = prevPos;
148 c3 = UTF16.charAt(fText, prevPos);
151 // Loop runs once per "significant" character position in the input text.
153 // Move all of the positions forward in the input string.
157 // Advance p3 by one codepoint
158 p3 = moveIndex32(fText, p3, 1);
159 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
162 // Still warming up the loop. (won't work with zero length strings, but we don't care)
165 if (p2 == fText.length()) {
166 // Reached end of string. Always a break position.
171 // No Extend or Format characters may appear between the CR and LF,
172 // which requires the additional check for p2 immediately following p1.
174 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
178 // Rule (GB4). ( Control | CR | LF ) <break>
179 if (fControlSet.contains(c1) ||
185 // Rule (GB5) <break> ( Control | CR | LF )
187 if (fControlSet.contains(c2) ||
194 // Rule (GB6) L x ( L | V | LV | LVT )
195 if (fLSet.contains(c1) &&
196 (fLSet.contains(c2) ||
197 fVSet.contains(c2) ||
198 fLVSet.contains(c2) ||
199 fLVTSet.contains(c2))) {
203 // Rule (GB7) ( LV | V ) x ( V | T )
204 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
205 (fVSet.contains(c2) || fTSet.contains(c2))) {
209 // Rule (GB8) ( LVT | T) x T
210 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
211 fTSet.contains(c2)) {
215 // Rule (GB9) x Extend
216 if (fExtendSet.contains(c2)) {
220 // Rule (GB9a) x SpacingMark
221 if (fSpacingSet.contains(c2)) {
225 // Rule (GB9b) Prepend x
226 if (fPrependSet.contains(c1)) {
230 // Rule (GB10) Any <break> Any
242 * Word Monkey Test Class
247 static class RBBIWordMonkey extends RBBIMonkeyKind {
253 UnicodeSet fNewlineSet;
254 UnicodeSet fKatakanaSet;
255 UnicodeSet fALetterSet;
256 UnicodeSet fMidNumLetSet;
257 UnicodeSet fMidLetterSet;
258 UnicodeSet fMidNumSet;
259 UnicodeSet fNumericSet;
260 UnicodeSet fFormatSet;
261 UnicodeSet fExtendSet;
262 UnicodeSet fExtendNumLetSet;
263 UnicodeSet fOtherSet;
267 fCharProperty = UProperty.WORD_BREAK;
269 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
270 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
271 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
272 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
273 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
274 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
275 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
276 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
277 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
278 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
279 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
280 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
282 fOtherSet = new UnicodeSet();
283 fOtherSet.complement();
284 fOtherSet.removeAll(fCRSet);
285 fOtherSet.removeAll(fLFSet);
286 fOtherSet.removeAll(fNewlineSet);
287 fOtherSet.removeAll(fALetterSet);
288 fOtherSet.removeAll(fKatakanaSet);
289 fOtherSet.removeAll(fMidLetterSet);
290 fOtherSet.removeAll(fMidNumSet);
291 fOtherSet.removeAll(fNumericSet);
292 fOtherSet.removeAll(fFormatSet);
293 fOtherSet.removeAll(fExtendSet);
294 fOtherSet.removeAll(fExtendNumLetSet);
295 // Inhibit dictionary characters from being tested at all.
296 fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
298 fSets = new ArrayList();
301 fSets.add(fNewlineSet);
302 fSets.add(fALetterSet);
303 fSets.add(fKatakanaSet);
304 fSets.add(fMidLetterSet);
305 fSets.add(fMidNumLetSet);
306 fSets.add(fMidNumSet);
307 fSets.add(fNumericSet);
308 fSets.add(fFormatSet);
309 fSets.add(fExtendSet);
310 fSets.add(fExtendNumLetSet);
311 fSets.add(fOtherSet);
319 void setText(StringBuffer s) {
323 int next(int prevPos) {
324 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
325 // break position being tested. The candidate break
326 // location is before p2.
329 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
331 // Previous break at end of string. return DONE.
332 if (prevPos >= fText.length()) {
335 /*p0 =*/ p1 = p2 = p3 = prevPos;
336 c3 = UTF16.charAt(fText, prevPos);
341 // Loop runs once per "significant" character position in the input text.
343 // Move all of the positions forward in the input string.
344 /*p0 = p1;*/ c0 = c1;
348 // Advance p3 by X(Extend | Format)* Rule 4
349 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
351 p3 = moveIndex32(fText, p3, 1);
353 if (p3>=fText.length()) {
356 c3 = UTF16.charAt(fText, p3);
357 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
361 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
364 // Still warming up the loop. (won't work with zero length strings, but we don't care)
367 if (p2 == fText.length()) {
368 // Reached end of string. Always a break position.
373 // No Extend or Format characters may appear between the CR and LF,
374 // which requires the additional check for p2 immediately following p1.
376 if (c1==0x0D && c2==0x0A) {
380 // Rule (3a) Break before and after newlines (including CR and LF)
382 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
385 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
389 // Rule (5). ALetter x ALetter
390 if (fALetterSet.contains(c1) &&
391 fALetterSet.contains(c2)) {
395 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
397 if ( fALetterSet.contains(c1) &&
398 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
399 setContains(fALetterSet, c3)) {
404 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
405 if (fALetterSet.contains(c0) &&
406 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
407 fALetterSet.contains(c2)) {
411 // Rule (8) Numeric x Numeric
412 if (fNumericSet.contains(c1) &&
413 fNumericSet.contains(c2)) {
417 // Rule (9) ALetter x Numeric
418 if (fALetterSet.contains(c1) &&
419 fNumericSet.contains(c2)) {
423 // Rule (10) Numeric x ALetter
424 if (fNumericSet.contains(c1) &&
425 fALetterSet.contains(c2)) {
429 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
430 if ( fNumericSet.contains(c0) &&
431 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
432 fNumericSet.contains(c2)) {
436 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
437 if (fNumericSet.contains(c1) &&
438 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
439 setContains(fNumericSet, c3)) {
443 // Rule (13) Katakana x Katakana
444 if (fKatakanaSet.contains(c1) &&
445 fKatakanaSet.contains(c2)) {
449 // Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
450 if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
451 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
452 fExtendNumLetSet.contains(c2)) {
455 // Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
456 if (fExtendNumLetSet.contains(c1) &&
457 (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
458 fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
462 // Rule 14. Break found here.
473 static class RBBILineMonkey extends RBBIMonkeyKind {
522 fCharProperty = UProperty.LINE_BREAK;
523 fSets = new ArrayList();
525 fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
526 fCR = new UnicodeSet("[\\p{Line_break=CR}]");
527 fLF = new UnicodeSet("[\\p{Line_break=LF}]");
528 fCM = new UnicodeSet("[\\p{Line_break=CM}]");
529 fNL = new UnicodeSet("[\\p{Line_break=NL}]");
530 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
531 fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
532 fGL = new UnicodeSet("[\\p{Line_break=GL}]");
533 fCB = new UnicodeSet("[\\p{Line_break=CB}]");
534 fSP = new UnicodeSet("[\\p{Line_break=SP}]");
535 fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
536 fBA = new UnicodeSet("[\\p{Line_break=BA}]");
537 fBB = new UnicodeSet("[\\p{Line_break=BB}]");
538 fHY = new UnicodeSet("[\\p{Line_break=HY}]");
539 fCL = new UnicodeSet("[\\p{Line_break=CL}]");
540 fCP = new UnicodeSet("[\\p{Line_break=CP}]");
541 fEX = new UnicodeSet("[\\p{Line_break=EX}]");
542 fIN = new UnicodeSet("[\\p{Line_break=IN}]");
543 fNS = new UnicodeSet("[\\p{Line_break=NS}]");
544 fOP = new UnicodeSet("[\\p{Line_break=OP}]");
545 fQU = new UnicodeSet("[\\p{Line_break=QU}]");
546 fIS = new UnicodeSet("[\\p{Line_break=IS}]");
547 fNU = new UnicodeSet("[\\p{Line_break=NU}]");
548 fPO = new UnicodeSet("[\\p{Line_break=PO}]");
549 fPR = new UnicodeSet("[\\p{Line_break=PR}]");
550 fSY = new UnicodeSet("[\\p{Line_break=SY}]");
551 fAI = new UnicodeSet("[\\p{Line_break=AI}]");
552 fAL = new UnicodeSet("[\\p{Line_break=AL}]");
553 fID = new UnicodeSet("[\\p{Line_break=ID}]");
554 fSA = new UnicodeSet("[\\p{Line_break=SA}]");
555 fJL = new UnicodeSet("[\\p{Line_break=JL}]");
556 fJV = new UnicodeSet("[\\p{Line_break=JV}]");
557 fJT = new UnicodeSet("[\\p{Line_break=JT}]");
558 fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
559 fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
560 fSG = new UnicodeSet("[\\ud800-\\udfff]");
561 fXX = new UnicodeSet("[\\p{Line_break=XX}]");
564 fAL.addAll(fXX); // Default behavior for XX is identical to AL
565 fAL.addAll(fAI); // Default behavior for AI is identical to AL
566 fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
567 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
611 void setText(StringBuffer s) {
618 int next(int startPos) {
619 int pos; // Index of the char following a potential break position
620 int thisChar; // Character at above position "pos"
622 int prevPos; // Index of the char preceding a potential break position
623 int prevChar; // Character at above position. Note that prevChar
624 // and thisChar may not be adjacent because combining
625 // characters between them will be ignored.
627 int nextPos; // Index of the next character following pos.
628 // Usually skips over combining marks.
629 int tPos; // temp value.
630 int matchVals[] = null; // Number Expression Match Results
633 if (startPos >= fText.length()) {
638 // Initial values for loop. Loop will run the first time without finding breaks,
639 // while the invalid values shift out and the "this" and
640 // "prev" positions are filled in with good values.
641 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
642 thisChar = prevChar = 0;
646 // Loop runs once per position in the test text, until a break position
647 // is found. In each iteration, we are testing for a possible break
648 // just preceding the character at index "pos". The character preceding
649 // this char is at position "prevPos"; because of combining sequences,
650 // "prevPos" can be arbitrarily far before "pos".
652 // Advance to the next position to be tested.
656 nextPos = moveIndex32(fText, pos, 1);
658 // Rule LB2 - Break at end of text.
659 if (pos >= fText.length()) {
663 // Rule LB 9 - adjust for combining sequences.
664 // We do this rule out-of-order because the adjustment does
665 // not affect the way that rules LB 3 through LB 6 match,
666 // and doing it here rather than after LB 6 is substantially
667 // simpler when combining sequences do occur.
670 // LB 9 Keep combining sequences together.
671 // advance over any CM class chars at "pos",
672 // result is "nextPos" for the following loop iteration.
673 thisChar = UTF16.charAt(fText, pos);
674 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
675 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
677 if (nextPos == fText.length()) {
680 int nextChar = UTF16.charAt(fText, nextPos);
681 if (!fCM.contains(nextChar)) {
684 nextPos = moveIndex32(fText, nextPos, 1);
688 // LB 9 Treat X CM* as if it were X
689 // No explicit action required.
691 // LB 10 Treat any remaining combining mark as AL
692 if (fCM.contains(thisChar)) {
697 // If the loop is still warming up - if we haven't shifted the initial
698 // -1 positions out of prevPos yet - loop back to advance the
699 // position in the input without any further looking for breaks.
704 // LB 4 Always break after hard line breaks,
705 if (fBK.contains(prevChar)) {
709 // LB 5 Break after CR, LF, NL, but not inside CR LF
710 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
713 if (fCR.contains(prevChar) ||
714 fLF.contains(prevChar) ||
715 fNL.contains(prevChar)) {
719 // LB 6 Don't break before hard line breaks
720 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
721 fLF.contains(thisChar) || fNL.contains(thisChar) ) {
726 // LB 7 Don't break before spaces or zero-width space.
727 if (fSP.contains(thisChar)) {
731 if (fZW.contains(thisChar)) {
735 // LB 8 Break after zero width space
736 if (fZW.contains(prevChar)) {
740 // LB 9, 10 Already done, at top of loop.
747 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
754 if (fGL.contains(prevChar)) {
760 if (!(fSP.contains(prevChar) ||
761 fBA.contains(prevChar) ||
762 fHY.contains(prevChar) ) && fGL.contains(thisChar)) {
768 // LB 13 Don't break before closings.
769 // NU x CL, NU x CP and NU x IS are not matched here so that they will
770 // fall into LB 17 and the more general number regular expression.
772 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
773 !fNU.contains(prevChar) && fCP.contains(thisChar) ||
774 fEX.contains(thisChar) ||
775 !fNU.contains(prevChar) && fIS.contains(thisChar) ||
776 !fNU.contains(prevChar) && fSY.contains(thisChar)) {
780 // LB 14 Don't break after OP SP*
781 // Scan backwards, checking for this sequence.
782 // The OP char could include combining marks, so we actually check for
785 if (fSP.contains(prevChar)) {
786 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
787 tPos=moveIndex32(fText, tPos, -1);
790 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
791 tPos=moveIndex32(fText, tPos, -1);
793 if (fOP.contains(UTF16.charAt(fText, tPos))) {
797 // LB 15 Do not break within '"[', even with intervening spaces: QU SP* x OP
799 if (fOP.contains(thisChar)) {
800 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
802 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
803 tPos = moveIndex32(fText, tPos, -1);
805 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
806 tPos = moveIndex32(fText, tPos, -1);
808 if (fQU.contains(UTF16.charAt(fText, tPos))) {
813 // LB 16 (CL | CP) SP* x NS
814 if (fNS.contains(thisChar)) {
816 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
817 tPos = moveIndex32(fText, tPos, -1);
819 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
820 tPos = moveIndex32(fText, tPos, -1);
822 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
829 if (fB2.contains(thisChar)) {
831 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
832 tPos = moveIndex32(fText, tPos, -1);
834 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
835 tPos = moveIndex32(fText, tPos, -1);
837 if (fB2.contains(UTF16.charAt(fText, tPos))) {
842 // LB 18 break after space
843 if (fSP.contains(prevChar)) {
850 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
854 // LB 20 Break around a CB
855 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
860 if (fBA.contains(thisChar) ||
861 fHY.contains(thisChar) ||
862 fNS.contains(thisChar) ||
863 fBB.contains(prevChar) ) {
868 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
869 fID.contains(prevChar) && fIN.contains(thisChar) ||
870 fIN.contains(prevChar) && fIN.contains(thisChar) ||
871 fNU.contains(prevChar) && fIN.contains(thisChar) ) {
876 // LB 23 ID x PO (Note: Leading CM behaves like ID)
879 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
880 fAL.contains(prevChar) && fNU.contains(thisChar) ||
881 fNU.contains(prevChar) && fAL.contains(thisChar) ) {
885 // LB 24 Do not break between prefix and letters or ideographs.
889 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
890 fPR.contains(prevChar) && fAL.contains(thisChar) ||
891 fPO.contains(prevChar) && fAL.contains(thisChar)) {
897 matchVals = LBNumberCheck(fText, prevPos, matchVals);
898 if (matchVals[0] != -1) {
899 // Matched a number. But could have been just a single digit, which would
900 // not represent a "no break here" between prevChar and thisChar
901 int numEndIdx = matchVals[1]; // idx of first char following num
902 if (numEndIdx > pos) {
903 // Number match includes at least the two chars being checked
904 if (numEndIdx > nextPos) {
905 // Number match includes additional chars. Update pos and nextPos
906 // so that next loop iteration will continue at the end of the number,
907 // checking for breaks between last char in number & whatever follows.
911 pos = moveIndex32(fText, pos, -1);
912 thisChar = UTF16.charAt(fText, pos);
914 while (fCM.contains(thisChar));
921 // LB 26 Do not break Korean Syllables
922 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
923 fJV.contains(thisChar) ||
924 fH2.contains(thisChar) ||
925 fH3.contains(thisChar))) {
929 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) &&
930 (fJV.contains(thisChar) || fJT.contains(thisChar))) {
934 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
935 fJT.contains(thisChar)) {
939 // LB 27 Treat a Korean Syllable Block the same as ID
940 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
941 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
942 fIN.contains(thisChar)) {
945 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
946 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
947 fPO.contains(thisChar)) {
950 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
951 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
957 // LB 28 Do not break between alphabetics
958 if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
962 // LB 29 Do not break between numeric punctuation and alphabetics
963 if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
967 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
970 if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
973 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
978 // LB 31 Break everywhere else
987 // Match the following regular expression in the input text.
988 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*)* ((CL | CP) CM*)? ((PR | PO) CM*)?
989 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states)
990 // retVals array [0] index of the start of the match, or -1 if no match
991 // [1] index of first char following the match.
992 // Can not use Java regex because need supplementary character support,
993 // and because Unicode char properties version must be the same as in
994 // the version of ICU being tested.
995 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
996 if (retVals == null) {
997 retVals = new int[2];
999 retVals[0] = -1; // Indicates no match.
1003 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1004 int c = UTF16.charAt(s, idx);
1005 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1006 switch (matchState) {
1008 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1009 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1013 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1017 if (cLBType == UCharacter.LineBreak.HYPHEN) {
1021 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1025 break matchLoop; /* No Match */
1028 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1032 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1036 if (cLBType == UCharacter.LineBreak.HYPHEN) {
1040 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1044 break matchLoop; /* No Match */
1048 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1052 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1056 break matchLoop; /* No Match */
1057 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*)* ((CL | CP) CM*)? ((PR | PO) CM*)?
1058 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states)
1061 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1065 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1069 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1073 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1077 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1081 if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1085 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1089 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1094 break matchLoop; // Match Complete.
1096 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1100 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1104 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1108 break matchLoop; // Match Complete.
1110 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1114 break matchLoop; // Match Complete.
1117 if (matchState > 4) {
1118 retVals[0] = startIdx;
1125 List charClasses() {
1136 * Sentence Monkey Test Class
1141 static class RBBISentenceMonkey extends RBBIMonkeyKind {
1146 UnicodeSet fFormatSet;
1148 UnicodeSet fLowerSet;
1149 UnicodeSet fUpperSet;
1150 UnicodeSet fOLetterSet;
1151 UnicodeSet fNumericSet;
1152 UnicodeSet fATermSet;
1153 UnicodeSet fSContinueSet;
1154 UnicodeSet fSTermSet;
1155 UnicodeSet fCloseSet;
1156 UnicodeSet fOtherSet;
1157 UnicodeSet fExtendSet;
1161 RBBISentenceMonkey() {
1162 fCharProperty = UProperty.SENTENCE_BREAK;
1164 fSets = new ArrayList();
1166 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
1167 // set and made into character classes of their own. For the monkey impl,
1168 // they remain in SEP, since Sep always appears with CR and LF in the rules.
1169 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1170 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1171 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1172 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1173 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1174 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1175 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1176 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1177 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1178 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1179 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1180 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1181 fOtherSet = new UnicodeSet();
1184 fOtherSet.complement();
1185 fOtherSet.removeAll(fSepSet);
1186 fOtherSet.removeAll(fFormatSet);
1187 fOtherSet.removeAll(fSpSet);
1188 fOtherSet.removeAll(fLowerSet);
1189 fOtherSet.removeAll(fUpperSet);
1190 fOtherSet.removeAll(fOLetterSet);
1191 fOtherSet.removeAll(fNumericSet);
1192 fOtherSet.removeAll(fATermSet);
1193 fOtherSet.removeAll(fSContinueSet);
1194 fOtherSet.removeAll(fSTermSet);
1195 fOtherSet.removeAll(fCloseSet);
1196 fOtherSet.removeAll(fExtendSet);
1199 fSets.add(fFormatSet);
1202 fSets.add(fLowerSet);
1203 fSets.add(fUpperSet);
1204 fSets.add(fOLetterSet);
1205 fSets.add(fNumericSet);
1206 fSets.add(fATermSet);
1207 fSets.add(fSContinueSet);
1208 fSets.add(fSTermSet);
1209 fSets.add(fCloseSet);
1210 fSets.add(fOtherSet);
1211 fSets.add(fExtendSet);
1215 List charClasses() {
1219 void setText(StringBuffer s) {
1224 // moveBack() Find the "significant" code point preceding the index i.
1225 // Skips over ($Extend | $Format)*
1227 private int moveBack(int i) {
1236 j = moveIndex32(fText, j, -1);
1237 c = UTF16.charAt(fText, j);
1239 while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1244 int moveForward(int i) {
1245 if (i>=fText.length()) {
1246 return fText.length();
1251 j = moveIndex32(fText, j, 1);
1254 while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1260 if (pos<0 || pos>=fText.length()) {
1263 return UTF16.charAt(fText, pos);
1266 int next(int prevPos) {
1267 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
1268 // break position being tested. The candidate break
1269 // location is before p2.
1272 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1275 // Prev break at end of string. return DONE.
1276 if (prevPos >= fText.length()) {
1279 /*p0 =*/ p1 = p2 = p3 = prevPos;
1280 c3 = UTF16.charAt(fText, prevPos);
1283 // Loop runs once per "significant" character position in the input text.
1285 // Move all of the positions forward in the input string.
1286 /*p0 = p1;*/ c0 = c1;
1290 // Advance p3 by X(Extend | Format)* Rule 4
1291 p3 = moveForward(p3);
1295 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1299 // Rule (4) Sep <break>
1300 if (fSepSet.contains(c1)) {
1301 p2 = p1+1; // Separators don't combine with Extend or Format
1305 if (p2 >= fText.length()) {
1306 // Reached end of string. Always a break position.
1310 if (p2 == prevPos) {
1311 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1315 // Rule (6). ATerm x Numeric
1316 if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
1320 // Rule (7). Upper ATerm x Upper
1321 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1325 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep))* Lower
1326 // Note: Sterm | ATerm are added to the negated part of the expression by a
1327 // note to the Unicode 5.0 documents.
1329 while (p8>0 && fSpSet.contains(cAt(p8))) {
1332 while (p8>0 && fCloseSet.contains(cAt(p8))) {
1335 if (fATermSet.contains(cAt(p8))) {
1339 if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1340 fLowerSet.contains(c) || fSepSet.contains(c) ||
1341 fATermSet.contains(c) || fSTermSet.contains(c))
1345 p8 = moveForward(p8);
1347 if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1352 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
1353 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1355 while (setContains(fSpSet, cAt(p8))) {
1358 while (setContains(fCloseSet, cAt(p8))) {
1362 if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1368 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
1370 while (p9>0 && fCloseSet.contains(cAt(p9))) {
1374 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1375 if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1380 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
1382 while (p10>0 && fSpSet.contains(cAt(p10))) {
1383 p10 = moveBack(p10);
1385 while (p10>0 && fCloseSet.contains(cAt(p10))) {
1386 p10 = moveBack(p10);
1388 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1389 if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1394 // Rule (11) (STerm | ATerm) Close* Sp* <break>
1396 if (p11>0 && fSepSet.contains(cAt(p11))) {
1397 p11 = moveBack(p11);
1399 while (p11>0 && fSpSet.contains(cAt(p11))) {
1400 p11 = moveBack(p11);
1402 while (p11>0 && fCloseSet.contains(cAt(p11))) {
1403 p11 = moveBack(p11);
1405 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1409 // Rule (12) Any x Any
1422 * Move an index into a string by n code points.
1423 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1424 * complicating usage.
1425 * @param s a Text string
1426 * @param pos The starting code unit index into the text string
1427 * @param amt The amount to adjust the string by.
1428 * @return The adjusted code unit index, pinned to the string's length, or
1429 * unchanged if input index was outside of the string.
1431 static int moveIndex32(StringBuffer s, int pos, int amt) {
1435 for (i=0; i<amt; i++) {
1436 if (pos >= s.length()) {
1441 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1443 if (UTF16.isTrailSurrogate(c)) {
1449 for (i=0; i>amt; i--) {
1455 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1457 if (UTF16.isLeadSurrogate(c)) {
1467 * No-exceptions form of UnicodeSet.contains(c).
1468 * Simplifies loops that terminate with an end-of-input character value.
1469 * @param s A unicode set
1470 * @param c A code point value
1471 * @return true if the set contains c.
1473 static boolean setContains(UnicodeSet s, int c) {
1474 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1477 return s.contains(c);
1482 * return the index of the next code point in the input text.
1483 * @param i the preceding index
1486 static int nextCP(StringBuffer s, int i) {
1488 // End of Input indication. Continue to return end value.
1492 if (retVal > s.length()) {
1495 int c = UTF16.charAt(s, i);
1496 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1504 * random number generator. Not using Java's built-in Randoms for two reasons:
1505 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1506 * 2. We need to get and restore the seed from values occurring in the middle
1507 * of a long sequence, to more easily reproduce failing cases.
1509 private static int m_seed = 1;
1510 private static int m_rand()
1512 m_seed = m_seed * 1103515245 + 12345;
1513 return (int)(m_seed >>> 16) % 32768;
1516 // Helper function for formatting error output.
1517 // Append a string into a fixed-size field in a StringBuffer.
1518 // Blank-pad the string if it is shorter than the field.
1519 // Truncate the source string if it is too long.
1521 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1522 int appendLen = src.length();
1523 if (appendLen >= fieldLen) {
1524 dest.append(src.substring(0, fieldLen));
1527 while (appendLen < fieldLen) {
1534 // Helper function for formatting error output.
1535 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1536 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1537 String hexChars = "0123456789abcdef";
1540 for (int bn=12; bn>=0; bn-=4) {
1541 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
1543 appendToBuf(dest, " ", fieldLen-6);
1546 for (int bn=28; bn>=0; bn-=4) {
1547 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
1549 appendToBuf(dest, " ", fieldLen-10);
1555 * Run a RBBI monkey test. Common routine, for all break iterator types.
1557 * bi - the break iterator to use
1558 * mk - MonkeyKind, abstraction for obtaining expected results
1559 * name - Name of test (char, word, etc.) for use in error messages
1560 * seed - Seed for starting random number generator (parameter from user)
1563 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) {
1564 int TESTSTRINGLEN = 500;
1565 StringBuffer testText = new StringBuffer();
1568 int[] expected = new int[TESTSTRINGLEN*2 + 1];
1569 int expectedCount = 0;
1570 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1571 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1572 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1573 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1574 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1575 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1578 boolean printTestData = false;
1579 boolean printBreaksFromBI = false;
1583 numCharClasses = mk.charClasses().size();
1584 chClasses = mk.charClasses();
1586 // Verify that the character classes all have at least one member.
1587 for (i=0; i<numCharClasses; i++) {
1588 UnicodeSet s = (UnicodeSet)chClasses.get(i);
1589 if (s == null || s.size() == 0) {
1590 errln("Character Class " + i + " is null or of zero size.");
1595 //--------------------------------------------------------------------------------------------
1597 // Debugging settings. Comment out everything in the following block for normal operation
1599 //--------------------------------------------------------------------------------------------
1600 // numIterations = -1;
1601 // RuleBasedBreakIterator_New.fTrace = true;
1602 // m_seed = 859056465;
1603 // TESTSTRINGLEN = 50;
1604 // printTestData = true;
1605 // printBreaksFromBI = true;
1606 // ((RuleBasedBreakIterator_New)bi).dump();
1608 //--------------------------------------------------------------------------------------------
1610 // End of Debugging settings.
1612 //--------------------------------------------------------------------------------------------
1615 while (loopCount < numIterations || numIterations == -1) {
1616 if (numIterations == -1 && loopCount % 10 == 0) {
1617 // If test is running in an infinite loop, display a periodic tic so
1618 // we can tell that it is making progress.
1619 System.out.print(".");
1620 if (dotsOnLine++ >= 80){
1621 System.out.println();
1625 // Save current random number seed, so that we can recreate the random numbers
1626 // for this loop iteration in event of an error.
1629 testText.setLength(0);
1630 // Populate a test string with data.
1631 if (printTestData) {
1632 System.out.println("Test Data string ...");
1634 for (i=0; i<TESTSTRINGLEN; i++) {
1635 int aClassNum = m_rand() % numCharClasses;
1636 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum);
1637 int charIdx = m_rand() % classSet.size();
1638 int c = classSet.charAt(charIdx);
1639 if (c < 0) { // TODO: deal with sets containing strings.
1642 UTF16.appendCodePoint(testText, c);
1643 if (printTestData) {
1644 System.out.print(Integer.toHexString(c) + " ");
1647 if (printTestData) {
1648 System.out.println();
1651 Arrays.fill(expected, 0);
1652 Arrays.fill(expectedBreaks, false);
1653 Arrays.fill(forwardBreaks, false);
1654 Arrays.fill(reverseBreaks, false);
1655 Arrays.fill(isBoundaryBreaks, false);
1656 Arrays.fill(followingBreaks, false);
1657 Arrays.fill(precedingBreaks, false);
1659 // Calculate the expected results for this test string.
1660 mk.setText(testText);
1662 expectedBreaks[0] = true;
1663 expected[expectedCount ++] = 0;
1665 int lastBreakPos = -1;
1667 lastBreakPos = breakPos;
1668 breakPos = mk.next(breakPos);
1669 if (breakPos == -1) {
1672 if (breakPos > testText.length()) {
1673 errln("breakPos > testText.length()");
1675 if (lastBreakPos >= breakPos) {
1676 errln("Next() not increasing.");
1679 expectedBreaks[breakPos] = true;
1680 expected[expectedCount ++] = breakPos;
1683 // Find the break positions using forward iteration
1684 if (printBreaksFromBI) {
1685 System.out.println("Breaks from BI...");
1687 bi.setText(testText.toString());
1688 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
1689 if (i < 0 || i > testText.length()) {
1690 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
1693 if (printBreaksFromBI) {
1694 System.out.print(Integer.toHexString(i) + " ");
1696 forwardBreaks[i] = true;
1698 if (printBreaksFromBI) {
1699 System.out.println();
1702 // Find the break positions using reverse iteration
1703 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
1704 if (i < 0 || i > testText.length()) {
1705 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
1708 reverseBreaks[i] = true;
1711 // Find the break positions using isBoundary() tests.
1712 for (i=0; i<=testText.length(); i++) {
1713 isBoundaryBreaks[i] = bi.isBoundary(i);
1716 // Find the break positions using the following() function.
1718 followingBreaks[0] = true;
1719 for (i=0; i<testText.length(); i++) {
1720 breakPos = bi.following(i);
1721 if (breakPos <= i ||
1722 breakPos < lastBreakPos ||
1723 breakPos > testText.length() ||
1724 breakPos > lastBreakPos && lastBreakPos > i ) {
1725 errln(name + " break monkey test: " +
1726 "Out of range value returned by BreakIterator::following().\n" +
1727 "index=" + i + "following returned=" + breakPos +
1728 "lastBreak=" + lastBreakPos);
1729 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1731 followingBreaks[breakPos] = true;
1732 lastBreakPos = breakPos;
1736 // Find the break positions using the preceding() function.
1737 lastBreakPos = testText.length();
1738 precedingBreaks[testText.length()] = true;
1739 for (i=testText.length(); i>0; i--) {
1740 breakPos = bi.preceding(i);
1741 if (breakPos >= i ||
1742 breakPos > lastBreakPos ||
1744 breakPos < lastBreakPos && lastBreakPos < i ) {
1745 errln(name + " break monkey test: " +
1746 "Out of range value returned by BreakIterator::preceding().\n" +
1747 "index=" + i + "preceding returned=" + breakPos +
1748 "lastBreak=" + lastBreakPos);
1749 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1751 precedingBreaks[breakPos] = true;
1752 lastBreakPos = breakPos;
1758 // Compare the expected and actual results.
1759 for (i=0; i<=testText.length(); i++) {
1760 String errorType = null;
1761 if (forwardBreaks[i] != expectedBreaks[i]) {
1762 errorType = "next()";
1763 } else if (reverseBreaks[i] != forwardBreaks[i]) {
1764 errorType = "previous()";
1765 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
1766 errorType = "isBoundary()";
1767 } else if (followingBreaks[i] != expectedBreaks[i]) {
1768 errorType = "following()";
1769 } else if (precedingBreaks[i] != expectedBreaks[i]) {
1770 errorType = "preceding()";
1774 if (errorType != null) {
1775 // Format a range of the test text that includes the failure as
1776 // a data item that can be included in the rbbi test data file.
1778 // Start of the range is the last point where expected and actual results
1779 // both agreed that there was a break position.
1780 int startContext = i;
1783 if (startContext==0) { break; }
1785 if (expectedBreaks[startContext]) {
1786 if (count == 2) break;
1791 // End of range is two expected breaks past the start position.
1792 int endContext = i + 1;
1794 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
1796 if (endContext >= testText.length()) {break;}
1797 if (expectedBreaks[endContext-1]) {
1798 if (count == 0) break;
1805 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
1806 StringBuffer errorText = new StringBuffer();
1808 int c; // Char from test data
1809 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
1811 // This is the location of the error.
1812 errorText.append("<?>---------------------------------\n");
1813 } else if (expectedBreaks[ci]) {
1814 // This a non-error expected break position.
1815 errorText.append("------------------------------------\n");
1817 if (ci < testText.length()) {
1818 c = UTF16.charAt(testText, ci);
1819 appendCharToBuf(errorText, c, 11);
1820 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
1821 appendToBuf(errorText, gc, 8);
1822 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
1823 String extraPropValue =
1824 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
1825 appendToBuf(errorText, extraPropValue, 20);
1827 String charName = UCharacter.getExtendedName(c);
1828 appendToBuf(errorText, charName, 40);
1829 errorText.append('\n');
1832 if (ci == testText.length() && ci != -1) {
1833 errorText.append("<>");
1835 errorText.append("</data>\n");
1838 errln(name + " break monkey test error. " +
1839 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
1840 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" +
1850 public void TestCharMonkey() {
1852 int loopCount = 500;
1855 if (params.inclusion >= 9) {
1859 RBBICharMonkey m = new RBBICharMonkey();
1860 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
1861 RunMonkey(bi, m, "char", seed, loopCount);
1864 public void TestWordMonkey() {
1866 int loopCount = 500;
1869 if (params.inclusion >= 9) {
1873 logln("Word Break Monkey Test");
1874 RBBIWordMonkey m = new RBBIWordMonkey();
1875 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1876 RunMonkey(bi, m, "word", seed, loopCount);
1879 public void TestLineMonkey() {
1880 int loopCount = 500;
1883 if (params.inclusion >= 9) {
1887 logln("Line Break Monkey Test");
1888 RBBILineMonkey m = new RBBILineMonkey();
1889 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
1890 if (params == null) {
1893 RunMonkey(bi, m, "line", seed, loopCount);
1896 public void TestSentMonkey() {
1898 int loopCount = 500;
1901 if (params.inclusion >= 9) {
1905 logln("Sentence Break Monkey Test");
1906 RBBISentenceMonkey m = new RBBISentenceMonkey();
1907 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
1908 if (params == null) {
1911 RunMonkey(bi, m, "sent", seed, loopCount);
1914 // Round-trip monkey tests.
1915 // Verify that break iterators created from the rule source from the default
1916 // break iterators still pass the monkey test for the iterator type.
1918 // This is a major test for the Rule Compiler. The default break iterators are built
1919 // from pre-compiled binary rule data that was created using ICU4C; these
1920 // round-trip rule recompile tests verify that the Java rule compiler can
1921 // rebuild break iterators from the original source rules.
1923 public void TestRTCharMonkey() {
1925 int loopCount = 200;
1928 if (params.inclusion >= 9) {
1932 RBBICharMonkey m = new RBBICharMonkey();
1933 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
1934 String rules = bi.toString();
1935 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1936 RunMonkey(rtbi, m, "char", seed, loopCount);
1939 public void TestRTWordMonkey() {
1941 int loopCount = 200;
1944 if (params.inclusion >= 9) {
1948 logln("Word Break Monkey Test");
1949 RBBIWordMonkey m = new RBBIWordMonkey();
1950 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1951 String rules = bi.toString();
1952 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1953 RunMonkey(rtbi, m, "word", seed, loopCount);
1956 public void TestRTLineMonkey() {
1957 int loopCount = 200;
1960 if (params.inclusion >= 9) {
1964 logln("Line Break Monkey Test");
1965 RBBILineMonkey m = new RBBILineMonkey();
1966 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
1967 String rules = bi.toString();
1968 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1969 if (params == null) {
1972 RunMonkey(rtbi, m, "line", seed, loopCount);
1975 public void TestRTSentMonkey() {
1977 int loopCount = 200;
1980 if (params.inclusion >= 9) {
1984 logln("Sentence Break Monkey Test");
1985 RBBISentenceMonkey m = new RBBISentenceMonkey();
1986 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
1987 String rules = bi.toString();
1988 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
1989 if (params == null) {
1992 RunMonkey(rtbi, m, "sent", seed, loopCount);