2 *******************************************************************************
3 * Copyright (C) 2003-2013 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.dev.test.rbbi;
10 // Monkey testing of RuleBasedBreakIterator
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.List;
14 import java.util.Locale;
16 import com.ibm.icu.dev.test.TestFmwk;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.lang.UProperty;
19 import com.ibm.icu.text.BreakIterator;
20 import com.ibm.icu.text.RuleBasedBreakIterator;
21 import com.ibm.icu.text.UTF16;
22 import com.ibm.icu.text.UnicodeSet;
26 * Monkey tests for RBBI. These tests have independent implementations of
27 * the Unicode TR boundary rules, and compare results between these and ICU's
28 * implementation, using random data.
30 * Tests cover Grapheme Cluster (char), Word and Line breaks
32 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
35 public class RBBITestMonkey extends TestFmwk {
37 public static void main(String[] args) {
38 new RBBITestMonkey().run(args);
42 // class RBBIMonkeyKind
44 // Monkey Test for Break Iteration
45 // Abstract interface class. Concrete derived classes independently
46 // implement the break rules for different iterator types.
48 // The Monkey Test itself doesn't know which type of break iterator it is
49 // testing, but works purely in terms of the interface defined here.
51 abstract static class RBBIMonkeyKind {
53 // Return a List of UnicodeSets, representing the character classes used
54 // for this type of iterator.
55 abstract List charClasses();
57 // Set the test text on which subsequent calls to next() will operate
58 abstract void setText(StringBuffer text);
60 // Find the next break position, starting from the specified position.
61 // Return -1 after reaching end of string.
62 abstract int next(int i);
64 // A Character Property, one of the constants defined in class UProperty.
65 // The value of this property will be displayed for the characters
66 // near any test failure.
72 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
73 * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
75 static class RBBICharMonkey extends RBBIMonkeyKind {
79 UnicodeSet fControlSet;
80 UnicodeSet fExtendSet;
81 UnicodeSet fRegionalIndicatorSet;
82 UnicodeSet fPrependSet;
83 UnicodeSet fSpacingSet;
89 UnicodeSet fHangulSet;
97 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
98 fCRLFSet = new UnicodeSet("[\\r\\n]");
99 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
100 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
101 fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
102 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
103 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
104 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
105 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
106 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
107 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
108 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
109 fHangulSet = new UnicodeSet();
110 fHangulSet.addAll(fLSet);
111 fHangulSet.addAll(fVSet);
112 fHangulSet.addAll(fTSet);
113 fHangulSet.addAll(fLVSet);
114 fHangulSet.addAll(fLVTSet);
116 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
118 fSets = new ArrayList();
120 fSets.add(fControlSet);
121 fSets.add(fExtendSet);
122 fSets.add(fRegionalIndicatorSet);
123 if (!fPrependSet.isEmpty()) {
124 fSets.add(fPrependSet);
126 fSets.add(fSpacingSet);
127 fSets.add(fHangulSet);
132 void setText(StringBuffer s) {
140 int next(int prevPos) {
141 int p1, p2, p3; // Indices of the significant code points around the
142 // break position being tested. The candidate break
143 // location is before p2.
147 int c1, c2, c3; // The code points at p0, p1, p2 & p3.
149 // Previous break at end of string. return DONE.
150 if (prevPos >= fText.length()) {
153 p1 = p2 = p3 = prevPos;
154 c3 = UTF16.charAt(fText, prevPos);
157 // Loop runs once per "significant" character position in the input text.
159 // Move all of the positions forward in the input string.
163 // Advance p3 by one codepoint
164 p3 = moveIndex32(fText, p3, 1);
165 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
168 // Still warming up the loop. (won't work with zero length strings, but we don't care)
171 if (p2 == fText.length()) {
172 // Reached end of string. Always a break position.
177 // No Extend or Format characters may appear between the CR and LF,
178 // which requires the additional check for p2 immediately following p1.
180 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
184 // Rule (GB4). ( Control | CR | LF ) <break>
185 if (fControlSet.contains(c1) ||
191 // Rule (GB5) <break> ( Control | CR | LF )
193 if (fControlSet.contains(c2) ||
200 // Rule (GB6) L x ( L | V | LV | LVT )
201 if (fLSet.contains(c1) &&
202 (fLSet.contains(c2) ||
203 fVSet.contains(c2) ||
204 fLVSet.contains(c2) ||
205 fLVTSet.contains(c2))) {
209 // Rule (GB7) ( LV | V ) x ( V | T )
210 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
211 (fVSet.contains(c2) || fTSet.contains(c2))) {
215 // Rule (GB8) ( LVT | T) x T
216 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
217 fTSet.contains(c2)) {
221 // Rule (GB8a) Regional_Indicator x Regional_Indicator
222 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
226 // Rule (GB9) x Extend
227 if (fExtendSet.contains(c2)) {
231 // Rule (GB9a) x SpacingMark
232 if (fSpacingSet.contains(c2)) {
236 // Rule (GB9b) Prepend x
237 if (fPrependSet.contains(c1)) {
241 // Rule (GB10) Any <break> Any
253 * Word Monkey Test Class
258 static class RBBIWordMonkey extends RBBIMonkeyKind {
264 UnicodeSet fNewlineSet;
265 UnicodeSet fRegionalIndicatorSet;
266 UnicodeSet fKatakanaSet;
267 UnicodeSet fHebrew_LetterSet;
268 UnicodeSet fALetterSet;
269 UnicodeSet fSingle_QuoteSet;
270 UnicodeSet fDouble_QuoteSet;
271 UnicodeSet fMidNumLetSet;
272 UnicodeSet fMidLetterSet;
273 UnicodeSet fMidNumSet;
274 UnicodeSet fNumericSet;
275 UnicodeSet fFormatSet;
276 UnicodeSet fExtendSet;
277 UnicodeSet fExtendNumLetSet;
278 UnicodeSet fOtherSet;
279 UnicodeSet fDictionaryCjkSet;
283 fCharProperty = UProperty.WORD_BREAK;
285 fDictionaryCjkSet= new UnicodeSet("[[:Script=Hangul:][:Han:][:Hiragana:][:Katakana:]]");
286 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
287 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
288 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
289 fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
290 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
291 fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
292 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
293 fALetterSet.removeAll(fDictionaryCjkSet);
294 fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
295 fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
296 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
297 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
298 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
299 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
300 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
301 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
302 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
304 fOtherSet = new UnicodeSet();
305 fOtherSet.complement();
306 fOtherSet.removeAll(fCRSet);
307 fOtherSet.removeAll(fLFSet);
308 fOtherSet.removeAll(fNewlineSet);
309 fOtherSet.removeAll(fALetterSet);
310 fOtherSet.removeAll(fSingle_QuoteSet);
311 fOtherSet.removeAll(fDouble_QuoteSet);
312 fOtherSet.removeAll(fKatakanaSet);
313 fOtherSet.removeAll(fHebrew_LetterSet);
314 fOtherSet.removeAll(fMidLetterSet);
315 fOtherSet.removeAll(fMidNumSet);
316 fOtherSet.removeAll(fNumericSet);
317 fOtherSet.removeAll(fFormatSet);
318 fOtherSet.removeAll(fExtendSet);
319 fOtherSet.removeAll(fExtendNumLetSet);
320 fOtherSet.removeAll(fRegionalIndicatorSet);
321 // Inhibit dictionary characters from being tested at all.
322 // remove surrogates so as to not generate higher CJK characters
323 fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
324 fOtherSet.removeAll(fDictionaryCjkSet);
326 fSets = new ArrayList();
329 fSets.add(fNewlineSet);
330 fSets.add(fRegionalIndicatorSet);
331 fSets.add(fHebrew_LetterSet);
332 fSets.add(fALetterSet);
333 //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
334 fSets.add(fSingle_QuoteSet);
335 fSets.add(fDouble_QuoteSet);
336 fSets.add(fMidLetterSet);
337 fSets.add(fMidNumLetSet);
338 fSets.add(fMidNumSet);
339 fSets.add(fNumericSet);
340 fSets.add(fFormatSet);
341 fSets.add(fExtendSet);
342 fSets.add(fExtendNumLetSet);
343 fSets.add(fOtherSet);
351 void setText(StringBuffer s) {
355 int next(int prevPos) {
356 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
357 // break position being tested. The candidate break
358 // location is before p2.
361 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
363 // Previous break at end of string. return DONE.
364 if (prevPos >= fText.length()) {
367 /*p0 =*/ p1 = p2 = p3 = prevPos;
368 c3 = UTF16.charAt(fText, prevPos);
373 // Loop runs once per "significant" character position in the input text.
375 // Move all of the positions forward in the input string.
376 /*p0 = p1;*/ c0 = c1;
380 // Advance p3 by X(Extend | Format)* Rule 4
381 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
383 p3 = moveIndex32(fText, p3, 1);
385 if (p3>=fText.length()) {
388 c3 = UTF16.charAt(fText, p3);
389 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
393 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
396 // Still warming up the loop. (won't work with zero length strings, but we don't care)
399 if (p2 == fText.length()) {
400 // Reached end of string. Always a break position.
405 // No Extend or Format characters may appear between the CR and LF,
406 // which requires the additional check for p2 immediately following p1.
408 if (c1==0x0D && c2==0x0A) {
412 // Rule (3a) Break before and after newlines (including CR and LF)
414 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
417 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
421 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
422 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
423 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
427 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
429 if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
430 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
431 (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
435 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
436 if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
437 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
438 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
442 // Rule (7a) Hebrew_Letter x Single_Quote
443 if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
447 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
448 if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
452 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
453 if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
457 // Rule (8) Numeric x Numeric
458 if (fNumericSet.contains(c1) &&
459 fNumericSet.contains(c2)) {
463 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
464 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
465 fNumericSet.contains(c2)) {
469 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
470 if (fNumericSet.contains(c1) &&
471 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
475 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
476 if (fNumericSet.contains(c0) &&
477 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
478 fNumericSet.contains(c2)) {
482 // Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
483 if (fNumericSet.contains(c1) &&
484 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
485 setContains(fNumericSet, c3)) {
489 // Rule (13) Katakana x Katakana
490 if (fKatakanaSet.contains(c1) &&
491 fKatakanaSet.contains(c2)) {
495 // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
496 if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
497 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
498 fExtendNumLetSet.contains(c2)) {
502 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
503 if (fExtendNumLetSet.contains(c1) &&
504 (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
505 fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) {
510 // Rule 13c Do not break between Regional Indicators.
511 // Regional_Indicator × Regional_Indicator
512 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
516 // Rule 14. Break found here.
527 static class RBBILineMonkey extends RBBIMonkeyKind {
578 fCharProperty = UProperty.LINE_BREAK;
579 fSets = new ArrayList();
581 fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
582 fCR = new UnicodeSet("[\\p{Line_break=CR}]");
583 fLF = new UnicodeSet("[\\p{Line_break=LF}]");
584 fCM = new UnicodeSet("[\\p{Line_break=CM}]");
585 fNL = new UnicodeSet("[\\p{Line_break=NL}]");
586 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
587 fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
588 fGL = new UnicodeSet("[\\p{Line_break=GL}]");
589 fCB = new UnicodeSet("[\\p{Line_break=CB}]");
590 fSP = new UnicodeSet("[\\p{Line_break=SP}]");
591 fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
592 fBA = new UnicodeSet("[\\p{Line_break=BA}]");
593 fBB = new UnicodeSet("[\\p{Line_break=BB}]");
594 fHY = new UnicodeSet("[\\p{Line_break=HY}]");
595 fCL = new UnicodeSet("[\\p{Line_break=CL}]");
596 fCP = new UnicodeSet("[\\p{Line_break=CP}]");
597 fEX = new UnicodeSet("[\\p{Line_break=EX}]");
598 fIN = new UnicodeSet("[\\p{Line_break=IN}]");
599 fNS = new UnicodeSet("[\\p{Line_break=NS}]");
600 fOP = new UnicodeSet("[\\p{Line_break=OP}]");
601 fQU = new UnicodeSet("[\\p{Line_break=QU}]");
602 fIS = new UnicodeSet("[\\p{Line_break=IS}]");
603 fNU = new UnicodeSet("[\\p{Line_break=NU}]");
604 fPO = new UnicodeSet("[\\p{Line_break=PO}]");
605 fPR = new UnicodeSet("[\\p{Line_break=PR}]");
606 fSY = new UnicodeSet("[\\p{Line_break=SY}]");
607 fAI = new UnicodeSet("[\\p{Line_break=AI}]");
608 fAL = new UnicodeSet("[\\p{Line_break=AL}]");
609 fHL = new UnicodeSet("[\\p{Line_break=HL}]");
610 fID = new UnicodeSet("[\\p{Line_break=ID}]");
611 fSA = new UnicodeSet("[\\p{Line_break=SA}]");
612 fJL = new UnicodeSet("[\\p{Line_break=JL}]");
613 fJV = new UnicodeSet("[\\p{Line_break=JV}]");
614 fJT = new UnicodeSet("[\\p{Line_break=JT}]");
615 fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
616 fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
617 fSG = new UnicodeSet("[\\ud800-\\udfff]");
618 fRI = new UnicodeSet("[\\p{Line_break=RI}]");
619 fXX = new UnicodeSet("[\\p{Line_break=XX}]");
622 fAL.addAll(fXX); // Default behavior for XX is identical to AL
623 fAL.addAll(fAI); // Default behavior for AI is identical to AL
624 fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
625 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
670 void setText(StringBuffer s) {
677 int next(int startPos) {
678 int pos; // Index of the char following a potential break position
679 int thisChar; // Character at above position "pos"
681 int prevPos; // Index of the char preceding a potential break position
682 int prevChar; // Character at above position. Note that prevChar
683 // and thisChar may not be adjacent because combining
684 // characters between them will be ignored.
685 int prevCharX2; // Character before prevChar, more contex for LB 21a
687 int nextPos; // Index of the next character following pos.
688 // Usually skips over combining marks.
689 int tPos; // temp value.
690 int matchVals[] = null; // Number Expression Match Results
693 if (startPos >= fText.length()) {
698 // Initial values for loop. Loop will run the first time without finding breaks,
699 // while the invalid values shift out and the "this" and
700 // "prev" positions are filled in with good values.
701 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
702 thisChar = prevChar = prevCharX2 = 0;
706 // Loop runs once per position in the test text, until a break position
707 // is found. In each iteration, we are testing for a possible break
708 // just preceding the character at index "pos". The character preceding
709 // this char is at position "prevPos"; because of combining sequences,
710 // "prevPos" can be arbitrarily far before "pos".
712 // Advance to the next position to be tested.
713 prevCharX2 = prevChar;
717 nextPos = moveIndex32(fText, pos, 1);
719 // Rule LB2 - Break at end of text.
720 if (pos >= fText.length()) {
724 // Rule LB 9 - adjust for combining sequences.
725 // We do this rule out-of-order because the adjustment does
726 // not affect the way that rules LB 3 through LB 6 match,
727 // and doing it here rather than after LB 6 is substantially
728 // simpler when combining sequences do occur.
731 // LB 9 Keep combining sequences together.
732 // advance over any CM class chars at "pos",
733 // result is "nextPos" for the following loop iteration.
734 thisChar = UTF16.charAt(fText, pos);
735 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
736 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
738 if (nextPos == fText.length()) {
741 int nextChar = UTF16.charAt(fText, nextPos);
742 if (!fCM.contains(nextChar)) {
745 nextPos = moveIndex32(fText, nextPos, 1);
749 // LB 9 Treat X CM* as if it were X
750 // No explicit action required.
752 // LB 10 Treat any remaining combining mark as AL
753 if (fCM.contains(thisChar)) {
758 // If the loop is still warming up - if we haven't shifted the initial
759 // -1 positions out of prevPos yet - loop back to advance the
760 // position in the input without any further looking for breaks.
765 // LB 4 Always break after hard line breaks,
766 if (fBK.contains(prevChar)) {
770 // LB 5 Break after CR, LF, NL, but not inside CR LF
771 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
774 if (fCR.contains(prevChar) ||
775 fLF.contains(prevChar) ||
776 fNL.contains(prevChar)) {
780 // LB 6 Don't break before hard line breaks
781 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
782 fLF.contains(thisChar) || fNL.contains(thisChar) ) {
787 // LB 7 Don't break before spaces or zero-width space.
788 if (fSP.contains(thisChar)) {
792 if (fZW.contains(thisChar)) {
796 // LB 8 Break after zero width space
797 if (fZW.contains(prevChar)) {
801 // LB 9, 10 Already done, at top of loop.
808 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
815 if (fGL.contains(prevChar)) {
821 if (!(fSP.contains(prevChar) ||
822 fBA.contains(prevChar) ||
823 fHY.contains(prevChar) ) && fGL.contains(thisChar)) {
829 // LB 13 Don't break before closings.
830 // NU x CL, NU x CP and NU x IS are not matched here so that they will
831 // fall into LB 17 and the more general number regular expression.
833 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
834 !fNU.contains(prevChar) && fCP.contains(thisChar) ||
835 fEX.contains(thisChar) ||
836 !fNU.contains(prevChar) && fIS.contains(thisChar) ||
837 !fNU.contains(prevChar) && fSY.contains(thisChar)) {
841 // LB 14 Don't break after OP SP*
842 // Scan backwards, checking for this sequence.
843 // The OP char could include combining marks, so we actually check for
846 if (fSP.contains(prevChar)) {
847 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
848 tPos=moveIndex32(fText, tPos, -1);
851 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
852 tPos=moveIndex32(fText, tPos, -1);
854 if (fOP.contains(UTF16.charAt(fText, tPos))) {
858 // LB 15 Do not break within "[
860 if (fOP.contains(thisChar)) {
861 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
863 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
864 tPos = moveIndex32(fText, tPos, -1);
866 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
867 tPos = moveIndex32(fText, tPos, -1);
869 if (fQU.contains(UTF16.charAt(fText, tPos))) {
874 // LB 16 (CL | CP) SP* x NS
875 if (fNS.contains(thisChar)) {
877 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
878 tPos = moveIndex32(fText, tPos, -1);
880 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
881 tPos = moveIndex32(fText, tPos, -1);
883 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
890 if (fB2.contains(thisChar)) {
892 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
893 tPos = moveIndex32(fText, tPos, -1);
895 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
896 tPos = moveIndex32(fText, tPos, -1);
898 if (fB2.contains(UTF16.charAt(fText, tPos))) {
903 // LB 18 break after space
904 if (fSP.contains(prevChar)) {
911 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
915 // LB 20 Break around a CB
916 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
921 if (fBA.contains(thisChar) ||
922 fHY.contains(thisChar) ||
923 fNS.contains(thisChar) ||
924 fBB.contains(prevChar) ) {
928 // LB 21a, HL (HY | BA) x
929 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
934 if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
939 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
940 fHL.contains(prevChar) && fIN.contains(thisChar) ||
941 fID.contains(prevChar) && fIN.contains(thisChar) ||
942 fIN.contains(prevChar) && fIN.contains(thisChar) ||
943 fNU.contains(prevChar) && fIN.contains(thisChar) ) {
948 // LB 23 ID x PO (Note: Leading CM behaves like ID)
951 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
952 fAL.contains(prevChar) && fNU.contains(thisChar) ||
953 fHL.contains(prevChar) && fNU.contains(thisChar) ||
954 fNU.contains(prevChar) && fAL.contains(thisChar) ||
955 fNU.contains(prevChar) && fHL.contains(thisChar) ) {
959 // LB 24 Do not break between prefix and letters or ideographs.
963 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
964 fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
965 fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
971 matchVals = LBNumberCheck(fText, prevPos, matchVals);
972 if (matchVals[0] != -1) {
973 // Matched a number. But could have been just a single digit, which would
974 // not represent a "no break here" between prevChar and thisChar
975 int numEndIdx = matchVals[1]; // idx of first char following num
976 if (numEndIdx > pos) {
977 // Number match includes at least the two chars being checked
978 if (numEndIdx > nextPos) {
979 // Number match includes additional chars. Update pos and nextPos
980 // so that next loop iteration will continue at the end of the number,
981 // checking for breaks between last char in number & whatever follows.
985 pos = moveIndex32(fText, pos, -1);
986 thisChar = UTF16.charAt(fText, pos);
988 while (fCM.contains(thisChar));
995 // LB 26 Do not break Korean Syllables
996 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
997 fJV.contains(thisChar) ||
998 fH2.contains(thisChar) ||
999 fH3.contains(thisChar))) {
1003 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) &&
1004 (fJV.contains(thisChar) || fJT.contains(thisChar))) {
1008 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
1009 fJT.contains(thisChar)) {
1013 // LB 27 Treat a Korean Syllable Block the same as ID
1014 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1015 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1016 fIN.contains(thisChar)) {
1019 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1020 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1021 fPO.contains(thisChar)) {
1024 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
1025 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
1031 // LB 28 Do not break between alphabetics
1032 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1036 // LB 29 Do not break between numeric punctuation and alphabetics
1037 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1041 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
1044 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
1047 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
1051 // LB 30a Do not break between regional indicators. RI × RI
1052 if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
1056 // LB 31 Break everywhere else
1065 // Match the following regular expression in the input text.
1066 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
1067 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states)
1068 // retVals array [0] index of the start of the match, or -1 if no match
1069 // [1] index of first char following the match.
1070 // Can not use Java regex because need supplementary character support,
1071 // and because Unicode char properties version must be the same as in
1072 // the version of ICU being tested.
1073 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1074 if (retVals == null) {
1075 retVals = new int[2];
1077 retVals[0] = -1; // Indicates no match.
1081 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1082 int c = UTF16.charAt(s, idx);
1083 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1084 switch (matchState) {
1086 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1087 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1091 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1095 if (cLBType == UCharacter.LineBreak.HYPHEN) {
1099 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1103 break matchLoop; /* No Match */
1106 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1110 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1114 if (cLBType == UCharacter.LineBreak.HYPHEN) {
1118 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1122 break matchLoop; /* No Match */
1126 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1130 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1134 break matchLoop; /* No Match */
1135 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
1136 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states)
1139 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1143 if (cLBType == UCharacter.LineBreak.NUMERIC) {
1147 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1151 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1155 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1159 if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1163 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1167 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1172 break matchLoop; // Match Complete.
1174 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1178 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1182 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1186 break matchLoop; // Match Complete.
1188 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
1192 break matchLoop; // Match Complete.
1195 if (matchState > 4) {
1196 retVals[0] = startIdx;
1203 List charClasses() {
1214 * Sentence Monkey Test Class
1219 static class RBBISentenceMonkey extends RBBIMonkeyKind {
1224 UnicodeSet fFormatSet;
1226 UnicodeSet fLowerSet;
1227 UnicodeSet fUpperSet;
1228 UnicodeSet fOLetterSet;
1229 UnicodeSet fNumericSet;
1230 UnicodeSet fATermSet;
1231 UnicodeSet fSContinueSet;
1232 UnicodeSet fSTermSet;
1233 UnicodeSet fCloseSet;
1234 UnicodeSet fOtherSet;
1235 UnicodeSet fExtendSet;
1239 RBBISentenceMonkey() {
1240 fCharProperty = UProperty.SENTENCE_BREAK;
1242 fSets = new ArrayList();
1244 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
1245 // set and made into character classes of their own. For the monkey impl,
1246 // they remain in SEP, since Sep always appears with CR and LF in the rules.
1247 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1248 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1249 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1250 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1251 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1252 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1253 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1254 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1255 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1256 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1257 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1258 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1259 fOtherSet = new UnicodeSet();
1262 fOtherSet.complement();
1263 fOtherSet.removeAll(fSepSet);
1264 fOtherSet.removeAll(fFormatSet);
1265 fOtherSet.removeAll(fSpSet);
1266 fOtherSet.removeAll(fLowerSet);
1267 fOtherSet.removeAll(fUpperSet);
1268 fOtherSet.removeAll(fOLetterSet);
1269 fOtherSet.removeAll(fNumericSet);
1270 fOtherSet.removeAll(fATermSet);
1271 fOtherSet.removeAll(fSContinueSet);
1272 fOtherSet.removeAll(fSTermSet);
1273 fOtherSet.removeAll(fCloseSet);
1274 fOtherSet.removeAll(fExtendSet);
1277 fSets.add(fFormatSet);
1280 fSets.add(fLowerSet);
1281 fSets.add(fUpperSet);
1282 fSets.add(fOLetterSet);
1283 fSets.add(fNumericSet);
1284 fSets.add(fATermSet);
1285 fSets.add(fSContinueSet);
1286 fSets.add(fSTermSet);
1287 fSets.add(fCloseSet);
1288 fSets.add(fOtherSet);
1289 fSets.add(fExtendSet);
1293 List charClasses() {
1297 void setText(StringBuffer s) {
1302 // moveBack() Find the "significant" code point preceding the index i.
1303 // Skips over ($Extend | $Format)*
1305 private int moveBack(int i) {
1314 j = moveIndex32(fText, j, -1);
1315 c = UTF16.charAt(fText, j);
1317 while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1322 int moveForward(int i) {
1323 if (i>=fText.length()) {
1324 return fText.length();
1329 j = moveIndex32(fText, j, 1);
1332 while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1338 if (pos<0 || pos>=fText.length()) {
1341 return UTF16.charAt(fText, pos);
// Reference implementation of next() for the Sentence_Break character classes
// of this monkey kind. Starting from the previous break position prevPos, it
// walks the "significant" code points of fText and tests the numbered rules
// below, one after another, at each candidate position.
// Per the abstract RBBIMonkeyKind.next() contract, -1 is returned once the
// end of the string has been reached.
1344 int next(int prevPos) {
1345 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
1346 // break position being tested. The candidate break
1347 // location is before p2.
1350 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
1353 // Prev break at end of string. return DONE.
1354 if (prevPos >= fText.length()) {
// Prime the sliding window: every position starts at the previous break.
1357 /*p0 =*/ p1 = p2 = p3 = prevPos;
1358 c3 = UTF16.charAt(fText, prevPos);
1361 // Loop runs once per "significant" character position in the input text.
1363 // Move all of the positions forward in the input string.
1364 /*p0 = p1;*/ c0 = c1;
1368 // Advance p3 by X(Extend | Format)* Rule 4
// NOTE(review): UAX #29 appears to number the Extend/Format rule SB5, with
// SB4 being the Sep break handled below -- confirm the rule number above.
1369 p3 = moveForward(p3);
// CR directly followed by LF; p2==(p1+1) ensures nothing sits between them.
1373 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1377 // Rule (4) Sep <break>
1378 if (fSepSet.contains(c1)) {
1379 p2 = p1+1; // Separators don't combine with Extend or Format
1383 if (p2 >= fText.length()) {
1384 // Reached end of string. Always a break position.
1388 if (p2 == prevPos) {
1389 // Still warming up the loop. (won't work with zero length strings, but we don't care)
1393 // Rule (6). ATerm x Numeric
1394 if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
1398 // Rule (7). Upper ATerm x Upper
1399 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1403 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep))* Lower
1404 // Note: STerm | ATerm are added to the negated part of the expression by a
1405 // note to the Unicode 5.0 documents.
// Scan backwards over Sp*, then Close*, looking for the ATerm.
1407 while (p8>0 && fSpSet.contains(cAt(p8))) {
1410 while (p8>0 && fCloseSet.contains(cAt(p8))) {
1413 if (fATermSet.contains(cAt(p8))) {
// Then scan forwards over the negated class; any of these code points (or
// the -1 end-of-text value) ends that scan.
1417 if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1418 fLowerSet.contains(c) || fSepSet.contains(c) ||
1419 fATermSet.contains(c) || fSTermSet.contains(c))
1423 p8 = moveForward(p8);
// The rule applies only if the forward scan stopped on a Lower.
1425 if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1430 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)
1431 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
// These loops use setContains() rather than an index guard, so an
// out-of-range cAt() result simply fails the test and ends the scan.
1433 while (setContains(fSpSet, cAt(p8))) {
1436 while (setContains(fCloseSet, cAt(p8))) {
1440 if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1446 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
1448 while (p9>0 && fCloseSet.contains(cAt(p9))) {
1452 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1453 if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1458 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
1460 while (p10>0 && fSpSet.contains(cAt(p10))) {
1461 p10 = moveBack(p10);
1463 while (p10>0 && fCloseSet.contains(cAt(p10))) {
1464 p10 = moveBack(p10);
1466 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1467 if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1472 // Rule (11) (STerm | ATerm) Close* Sp* Sep? <break>
// At most one trailing Sep is skipped (an if, not a loop), then Sp*, then
// Close*, before testing for the terminator.
1474 if (p11>0 && fSepSet.contains(cAt(p11))) {
1475 p11 = moveBack(p11);
1477 while (p11>0 && fSpSet.contains(cAt(p11))) {
1478 p11 = moveBack(p11);
1480 while (p11>0 && fCloseSet.contains(cAt(p11))) {
1481 p11 = moveBack(p11);
1483 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1487 // Rule (12) Any x Any
1500 * Move an index into a string by n code points.
1501 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1502 * complicating usage.
1503 * @param s a Text string
1504 * @param pos The starting code unit index into the text string
1505 * @param amt The amount to adjust the string by.
1506 * @return The adjusted code unit index, pinned to the string's length, or
1507 * unchanged if input index was outside of the string.
1509 static int moveIndex32(StringBuffer s, int pos, int amt) {
// Positive amt: walk forward, one code point per iteration.
1513 for (i=0; i<amt; i++) {
// Already at or past the end of the text.
1514 if (pos >= s.length()) {
// A lead surrogate with text remaining after it may start a surrogate pair ...
1519 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
// ... which it does only when the next code unit is a trail surrogate.
1521 if (UTF16.isTrailSurrogate(c)) {
// Negative amt: walk backward, one code point per iteration.
1527 for (i=0; i>amt; i--) {
// Mirror case: a trail surrogate may be the second half of a pair ...
1533 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
// ... which it is only when the preceding code unit is a lead surrogate.
1535 if (UTF16.isLeadSurrogate(c)) {
1545 * No-exceptions form of UnicodeSet.contains(c).
1546 * Simplifies loops that terminate with an end-of-input character value.
1547 * @param s A unicode set
1548 * @param c A code point value
1549 * @return true if the set contains c.
1551 static boolean setContains(UnicodeSet s, int c) {
// Values outside the code point range (for example a negative end-of-input
// marker) are handled here and never reach UnicodeSet.contains().
1552 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
// In-range code point: plain set membership.
1555 return s.contains(c);
1560 * return the index of the next code point in the input text.
1561 * @param i the preceding index
1563 static int nextCP(StringBuffer s, int i) {
1565 // End of Input indication. Continue to return end value.
// The candidate result lies beyond the end of the text.
1569 if (retVal > s.length()) {
1572 int c = UTF16.charAt(s, i);
// A supplementary code point whose lead surrogate sits at index i occupies
// two code units; presumably retVal is advanced by one more unit in this
// branch -- TODO confirm.
1573 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1581 * random number generator. Not using Java's built-in Randoms for two reasons:
1582 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1583 * 2. We need to get and restore the seed from values occurring in the middle
1584 * of a long sequence, to more easily reproduce failing cases.
// Generator state. It is static, so one sequence is shared by every caller
// in this class.
1586 private static int m_seed = 1;
// Advances the generator and returns the next value, in the range 0..32767.
1587 private static int m_rand()
// Linear congruential step; int arithmetic silently wraps on overflow.
1589 m_seed = m_seed * 1103515245 + 12345;
// Take the upper 16 bits of the state (unsigned shift, so the value is
// never negative) and reduce them modulo 32768.
1590 return (int)(m_seed >>> 16) % 32768;
1593 // Helper function for formatting error output.
1594 // Append a string into a fixed-size field in a StringBuffer.
1595 // Blank-pad the string if it is shorter than the field.
1596 // Truncate the source string if it is too long.
1598 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1599 int appendLen = src.length();
// Source fills or overflows the field: append exactly fieldLen characters.
// NOTE(review): a negative fieldLen also takes this branch and would make
// substring(0, fieldLen) throw; callers compute fieldLen-6 and fieldLen-10,
// so confirm they always pass a wide enough field.
1600 if (appendLen >= fieldLen) {
1601 dest.append(src.substring(0, fieldLen));
// Source is shorter than the field: pad out to the field width.
1604 while (appendLen < fieldLen) {
1611 // Helper function for formatting error output.
1612 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1613 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
// Lookup table: index = nibble value, character = lower-case hex digit.
1614 String hexChars = "0123456789abcdef";
// Short form: four hex digits, most significant nibble first
// (shift counts 12, 8, 4, 0).
1617 for (int bn=12; bn>=0; bn-=4) {
1618 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
// Blank-fill the rest of the field; 6 is presumably the width of the
// short escape (two prefix characters plus four digits).
1620 appendToBuf(dest, " ", fieldLen-6);
// Long form: eight hex digits (shift counts 28 down to 0).
1623 for (int bn=28; bn>=0; bn-=4) {
1624 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
// Blank-fill the rest of the field; 10 is presumably the width of the
// long escape (two prefix characters plus eight digits).
1626 appendToBuf(dest, " ", fieldLen-10);
1632 * Run a RBBI monkey test. Common routine, for all break iterator types.
1634 * bi - the break iterator to use
1635 * mk - MonkeyKind, abstraction for obtaining expected results
1636 * name - Name of test (char, word, etc.) for use in error messages
1637 * seed - Seed for starting random number generator (parameter from user)
1640 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) {
1641 int TESTSTRINGLEN = 500;
1642 StringBuffer testText = new StringBuffer();
1645 int[] expected = new int[TESTSTRINGLEN*2 + 1];
1646 int expectedCount = 0;
1647 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1648 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1649 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1650 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1651 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1652 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1655 boolean printTestData = false;
1656 boolean printBreaksFromBI = false;
1660 numCharClasses = mk.charClasses().size();
1661 chClasses = mk.charClasses();
1663 // Verify that the character classes all have at least one member.
1664 for (i=0; i<numCharClasses; i++) {
1665 UnicodeSet s = (UnicodeSet)chClasses.get(i);
1666 if (s == null || s.size() == 0) {
1667 errln("Character Class " + i + " is null or of zero size.");
1672 //--------------------------------------------------------------------------------------------
1674 // Debugging settings. Comment out everything in the following block for normal operation
1676 //--------------------------------------------------------------------------------------------
1677 // numIterations = -1;
1678 // RuleBasedBreakIterator_New.fTrace = true;
1679 // m_seed = 859056465;
1680 // TESTSTRINGLEN = 50;
1681 // printTestData = true;
1682 // printBreaksFromBI = true;
1683 // ((RuleBasedBreakIterator_New)bi).dump();
1685 //--------------------------------------------------------------------------------------------
1687 // End of Debugging settings.
1689 //--------------------------------------------------------------------------------------------
1692 while (loopCount < numIterations || numIterations == -1) {
1693 if (numIterations == -1 && loopCount % 10 == 0) {
1694 // If test is running in an infinite loop, display a periodic tic so
1695 // we can tell that it is making progress.
1696 System.out.print(".");
1697 if (dotsOnLine++ >= 80){
1698 System.out.println();
1702 // Save current random number seed, so that we can recreate the random numbers
1703 // for this loop iteration in event of an error.
1706 testText.setLength(0);
1707 // Populate a test string with data.
1708 if (printTestData) {
1709 System.out.println("Test Data string ...");
1711 for (i=0; i<TESTSTRINGLEN; i++) {
1712 int aClassNum = m_rand() % numCharClasses;
1713 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum);
1714 int charIdx = m_rand() % classSet.size();
1715 int c = classSet.charAt(charIdx);
1716 if (c < 0) { // TODO: deal with sets containing strings.
1719 UTF16.appendCodePoint(testText, c);
1720 if (printTestData) {
1721 System.out.print(Integer.toHexString(c) + " ");
1724 if (printTestData) {
1725 System.out.println();
1728 Arrays.fill(expected, 0);
1729 Arrays.fill(expectedBreaks, false);
1730 Arrays.fill(forwardBreaks, false);
1731 Arrays.fill(reverseBreaks, false);
1732 Arrays.fill(isBoundaryBreaks, false);
1733 Arrays.fill(followingBreaks, false);
1734 Arrays.fill(precedingBreaks, false);
1736 // Calculate the expected results for this test string.
1737 mk.setText(testText);
1739 expectedBreaks[0] = true;
1740 expected[expectedCount ++] = 0;
1742 int lastBreakPos = -1;
1744 lastBreakPos = breakPos;
1745 breakPos = mk.next(breakPos);
1746 if (breakPos == -1) {
1749 if (breakPos > testText.length()) {
1750 errln("breakPos > testText.length()");
1752 if (lastBreakPos >= breakPos) {
1753 errln("Next() not increasing.");
1756 expectedBreaks[breakPos] = true;
1757 expected[expectedCount ++] = breakPos;
1760 // Find the break positions using forward iteration
1761 if (printBreaksFromBI) {
1762 System.out.println("Breaks from BI...");
1764 bi.setText(testText.toString());
1765 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
1766 if (i < 0 || i > testText.length()) {
1767 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
1770 if (printBreaksFromBI) {
1771 System.out.print(Integer.toHexString(i) + " ");
1773 forwardBreaks[i] = true;
1775 if (printBreaksFromBI) {
1776 System.out.println();
1779 // Find the break positions using reverse iteration
1780 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
1781 if (i < 0 || i > testText.length()) {
1782 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
1785 reverseBreaks[i] = true;
1788 // Find the break positions using isBoundary() tests.
1789 for (i=0; i<=testText.length(); i++) {
1790 isBoundaryBreaks[i] = bi.isBoundary(i);
1793 // Find the break positions using the following() function.
1795 followingBreaks[0] = true;
1796 for (i=0; i<testText.length(); i++) {
1797 breakPos = bi.following(i);
1798 if (breakPos <= i ||
1799 breakPos < lastBreakPos ||
1800 breakPos > testText.length() ||
1801 breakPos > lastBreakPos && lastBreakPos > i ) {
1802 errln(name + " break monkey test: " +
1803 "Out of range value returned by BreakIterator::following().\n" +
1804 "index=" + i + "following returned=" + breakPos +
1805 "lastBreak=" + lastBreakPos);
1806 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1808 followingBreaks[breakPos] = true;
1809 lastBreakPos = breakPos;
1813 // Find the break positions using the preceding() function.
1814 lastBreakPos = testText.length();
1815 precedingBreaks[testText.length()] = true;
1816 for (i=testText.length(); i>0; i--) {
1817 breakPos = bi.preceding(i);
1818 if (breakPos >= i ||
1819 breakPos > lastBreakPos ||
1821 breakPos < lastBreakPos && lastBreakPos < i ) {
1822 errln(name + " break monkey test: " +
1823 "Out of range value returned by BreakIterator::preceding().\n" +
1824 "index=" + i + "preceding returned=" + breakPos +
1825 "lastBreak=" + lastBreakPos);
1826 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
1828 precedingBreaks[breakPos] = true;
1829 lastBreakPos = breakPos;
1835 // Compare the expected and actual results.
1836 for (i=0; i<=testText.length(); i++) {
1837 String errorType = null;
1838 if (forwardBreaks[i] != expectedBreaks[i]) {
1839 errorType = "next()";
1840 } else if (reverseBreaks[i] != forwardBreaks[i]) {
1841 errorType = "previous()";
1842 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
1843 errorType = "isBoundary()";
1844 } else if (followingBreaks[i] != expectedBreaks[i]) {
1845 errorType = "following()";
1846 } else if (precedingBreaks[i] != expectedBreaks[i]) {
1847 errorType = "preceding()";
1851 if (errorType != null) {
1852 // Format a range of the test text that includes the failure as
1853 // a data item that can be included in the rbbi test data file.
1855 // Start of the range is the last point where expected and actual results
1856 // both agreed that there was a break position.
1857 int startContext = i;
1860 if (startContext==0) { break; }
1862 if (expectedBreaks[startContext]) {
1863 if (count == 2) break;
1868 // End of range is two expected breaks past the start position.
1869 int endContext = i + 1;
1871 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
1873 if (endContext >= testText.length()) {break;}
1874 if (expectedBreaks[endContext-1]) {
1875 if (count == 0) break;
1882 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
1883 StringBuffer errorText = new StringBuffer();
1885 int c; // Char from test data
1886 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
1888 // This is the location of the error.
1889 errorText.append("<?>---------------------------------\n");
1890 } else if (expectedBreaks[ci]) {
1891 // This a non-error expected break position.
1892 errorText.append("------------------------------------\n");
1894 if (ci < testText.length()) {
1895 c = UTF16.charAt(testText, ci);
1896 appendCharToBuf(errorText, c, 11);
1897 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
1898 appendToBuf(errorText, gc, 8);
1899 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
1900 String extraPropValue =
1901 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
1902 appendToBuf(errorText, extraPropValue, 20);
1904 String charName = UCharacter.getExtendedName(c);
1905 appendToBuf(errorText, charName, 40);
1906 errorText.append('\n');
1909 if (ci == testText.length() && ci != -1) {
1910 errorText.append("<>");
1912 errorText.append("</data>\n");
1915 errln(name + " break monkey test error. " +
1916 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
1917 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" +
// Monkey test of the character (grapheme cluster) break iterator for
// Locale.US, checked against the RBBICharMonkey reference implementation.
1927 public void TestCharMonkey() {
// Default number of random test strings.
1929 int loopCount = 500;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
1932 if (params.inclusion >= 9) {
1936 RBBICharMonkey m = new RBBICharMonkey();
1937 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
1938 RunMonkey(bi, m, "char", seed, loopCount);
// Monkey test of the word break iterator for Locale.US, checked against
// the RBBIWordMonkey reference implementation.
1941 public void TestWordMonkey() {
// Default number of random test strings.
1943 int loopCount = 500;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
1946 if (params.inclusion >= 9) {
1950 logln("Word Break Monkey Test");
1951 RBBIWordMonkey m = new RBBIWordMonkey();
1952 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
1953 RunMonkey(bi, m, "word", seed, loopCount);
// Monkey test of the line break iterator for Locale.US, checked against
// the RBBILineMonkey reference implementation.
1956 public void TestLineMonkey() {
// Default number of random test strings.
1957 int loopCount = 500;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
1960 if (params.inclusion >= 9) {
1964 logln("Line Break Monkey Test");
1965 RBBILineMonkey m = new RBBILineMonkey();
1966 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
// NOTE(review): params was already dereferenced by the inclusion check
// above, so if params can be null this check is reached too late --
// confirm whether it should come first.
1967 if (params == null) {
1970 RunMonkey(bi, m, "line", seed, loopCount);
// Monkey test of the sentence break iterator for Locale.US, checked against
// the RBBISentenceMonkey reference implementation.
1973 public void TestSentMonkey() {
// Default number of random test strings.
1975 int loopCount = 500;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
1978 if (params.inclusion >= 9) {
1982 logln("Sentence Break Monkey Test");
1983 RBBISentenceMonkey m = new RBBISentenceMonkey();
1984 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
// NOTE(review): params was already dereferenced by the inclusion check
// above, so if params can be null this check is reached too late --
// confirm whether it should come first.
1985 if (params == null) {
1988 RunMonkey(bi, m, "sent", seed, loopCount);
1991 // Round-trip monkey tests.
1992 // Verify that break iterators created from the rule source from the default
1993 // break iterators still pass the monkey test for the iterator type.
1995 // This is a major test for the Rule Compiler. The default break iterators are built
1996 // from pre-compiled binary rule data that was created using ICU4C; these
1997 // round-trip rule recompile tests verify that the Java rule compiler can
1998 // rebuild break iterators from the original source rules.
// Round-trip variant of the character monkey test: the rule source is taken
// from the default character iterator via toString(), recompiled into a new
// RuleBasedBreakIterator, and that rebuilt iterator is what gets tested.
2000 public void TestRTCharMonkey() {
// Default number of random test strings.
2002 int loopCount = 200;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
2005 if (params.inclusion >= 9) {
2009 RBBICharMonkey m = new RBBICharMonkey();
2010 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
// toString() on the default iterator supplies the rule source text.
2011 String rules = bi.toString();
2012 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2013 RunMonkey(rtbi, m, "char", seed, loopCount);
// Round-trip variant of the word monkey test: the rule source is taken from
// the default word iterator via toString(), recompiled into a new
// RuleBasedBreakIterator, and that rebuilt iterator is what gets tested.
2016 public void TestRTWordMonkey() {
// Default number of random test strings.
2018 int loopCount = 200;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
2021 if (params.inclusion >= 9) {
// NOTE(review): same log text as the non-round-trip word test, so the log
// does not distinguish the two -- confirm whether that is intended.
2024 logln("Word Break Monkey Test");
2025 RBBIWordMonkey m = new RBBIWordMonkey();
2026 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
// toString() on the default iterator supplies the rule source text.
2027 String rules = bi.toString();
2028 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2029 RunMonkey(rtbi, m, "word", seed, loopCount);
// Round-trip variant of the line monkey test: the rule source is taken from
// the default line iterator via toString(), recompiled into a new
// RuleBasedBreakIterator, and that rebuilt iterator is what gets tested.
2032 public void TestRTLineMonkey() {
// Default number of random test strings.
2033 int loopCount = 200;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
2036 if (params.inclusion >= 9) {
2040 logln("Line Break Monkey Test");
2041 RBBILineMonkey m = new RBBILineMonkey();
2042 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
// toString() on the default iterator supplies the rule source text.
2043 String rules = bi.toString();
2044 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
// NOTE(review): params was already dereferenced by the inclusion check
// above, so if params can be null this check is reached too late --
// confirm whether it should come first.
2045 if (params == null) {
2048 RunMonkey(rtbi, m, "line", seed, loopCount);
// Round-trip variant of the sentence monkey test: the rule source is taken
// from the default sentence iterator via toString(), recompiled into a new
// RuleBasedBreakIterator, and that rebuilt iterator is what gets tested.
2051 public void TestRTSentMonkey() {
// Default number of random test strings.
2053 int loopCount = 200;
// Inclusion level 9 or higher; presumably a larger loopCount is selected
// in this branch -- TODO confirm.
2056 if (params.inclusion >= 9) {
2060 logln("Sentence Break Monkey Test");
2061 RBBISentenceMonkey m = new RBBISentenceMonkey();
2062 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
// toString() on the default iterator supplies the rule source text.
2063 String rules = bi.toString();
2064 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
// NOTE(review): params was already dereferenced by the inclusion check
// above, so if params can be null this check is reached too late --
// confirm whether it should come first.
2065 if (params == null) {
2068 RunMonkey(rtbi, m, "sent", seed, loopCount);