2 *******************************************************************************
\r
3 * Copyright (C) 2003-2010 International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.rbbi;
\r
10 // Monkey testing of RuleBasedBreakIterator
\r
11 import java.util.ArrayList;
\r
12 import java.util.Arrays;
\r
13 import java.util.List;
\r
14 import java.util.Locale;
\r
16 import com.ibm.icu.dev.test.TestFmwk;
\r
17 import com.ibm.icu.lang.UCharacter;
\r
18 import com.ibm.icu.lang.UProperty;
\r
19 import com.ibm.icu.text.BreakIterator;
\r
20 import com.ibm.icu.text.RuleBasedBreakIterator;
\r
21 import com.ibm.icu.text.UTF16;
\r
22 import com.ibm.icu.text.UnicodeSet;
\r
26 * Monkey tests for RBBI. These tests have independent implementations of
\r
27 * the Unicode TR boundary rules, and compare results between these and ICU's
\r
28 * implementation, using random data.
\r
30 * Tests cover Grapheme Cluster (char), Word and Line breaks
\r
32 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
\r
35 public class RBBITestMonkey extends TestFmwk {
\r
37 public static void main(String[] args) {
\r
38 new RBBITestMonkey().run(args);
\r
42 // class RBBIMonkeyKind
\r
44 // Monkey Test for Break Iteration
\r
45 // Abstract interface class. Concrete derived classes independently
\r
46 // implement the break rules for different iterator types.
\r
48 // The Monkey Test itself doesn't know which type of break iterator it is
\r
49 // testing, but works purely in terms of the interface defined here.
\r
51 abstract static class RBBIMonkeyKind {
\r
53 // Return a List of UnicodeSets, representing the character classes used
\r
54 // for this type of iterator.
\r
55 abstract List charClasses();
\r
57 // Set the test text on which subsequent calls to next() will operate
\r
58 abstract void setText(StringBuffer text);
\r
60 // Find the next break position, starting from the specified position.
\r
61 // Return -1 after reaching end of string.
\r
62 abstract int next(int i);
\r
64 // A Character Property, one of the constants defined in class UProperty.
\r
65 // The value of this property will be displayed for the characters
\r
66 // near any test failure.
\r
72 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
\r
74 static class RBBICharMonkey extends RBBIMonkeyKind {
\r
77 UnicodeSet fCRLFSet;
\r
78 UnicodeSet fControlSet;
\r
79 UnicodeSet fExtendSet;
\r
80 UnicodeSet fPrependSet;
\r
81 UnicodeSet fSpacingSet;
\r
87 UnicodeSet fHangulSet;
\r
95 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
\r
96 fCRLFSet = new UnicodeSet("[\\r\\n]");
\r
97 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
\r
98 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
\r
99 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
\r
100 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
\r
101 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
\r
102 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
\r
103 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
\r
104 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
\r
105 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
\r
106 fHangulSet = new UnicodeSet();
\r
107 fHangulSet.addAll(fLSet);
\r
108 fHangulSet.addAll(fVSet);
\r
109 fHangulSet.addAll(fTSet);
\r
110 fHangulSet.addAll(fLVSet);
\r
111 fHangulSet.addAll(fLVTSet);
\r
113 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
\r
115 fSets = new ArrayList();
\r
116 fSets.add(fCRLFSet);
\r
117 fSets.add(fControlSet);
\r
118 fSets.add(fExtendSet);
\r
119 fSets.add(fPrependSet);
\r
120 fSets.add(fSpacingSet);
\r
121 fSets.add(fHangulSet);
\r
122 fSets.add(fAnySet);
\r
126 void setText(StringBuffer s) {
\r
130 List charClasses() {
\r
134 int next(int prevPos) {
\r
135 int p1, p2, p3; // Indices of the significant code points around the
\r
136 // break position being tested. The candidate break
\r
137 // location is before p2.
\r
141 int c1, c2, c3; // The code points at p0, p1, p2 & p3.
\r
143 // Previous break at end of string. return DONE.
\r
144 if (prevPos >= fText.length()) {
\r
147 p1 = p2 = p3 = prevPos;
\r
148 c3 = UTF16.charAt(fText, prevPos);
\r
151 // Loop runs once per "significant" character position in the input text.
\r
153 // Move all of the positions forward in the input string.
\r
157 // Advance p3 by one codepoint
\r
158 p3 = moveIndex32(fText, p3, 1);
\r
159 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
\r
162 // Still warming up the loop. (won't work with zero length strings, but we don't care)
\r
165 if (p2 == fText.length()) {
\r
166 // Reached end of string. Always a break position.
\r
170 // Rule GB3 CR x LF
\r
171 // No Extend or Format characters may appear between the CR and LF,
\r
172 // which requires the additional check for p2 immediately following p1.
\r
174 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
\r
178 // Rule (GB4). ( Control | CR | LF ) <break>
\r
179 if (fControlSet.contains(c1) ||
\r
185 // Rule (GB5) <break> ( Control | CR | LF )
\r
187 if (fControlSet.contains(c2) ||
\r
194 // Rule (GB6) L x ( L | V | LV | LVT )
\r
195 if (fLSet.contains(c1) &&
\r
196 (fLSet.contains(c2) ||
\r
197 fVSet.contains(c2) ||
\r
198 fLVSet.contains(c2) ||
\r
199 fLVTSet.contains(c2))) {
\r
203 // Rule (GB7) ( LV | V ) x ( V | T )
\r
204 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
\r
205 (fVSet.contains(c2) || fTSet.contains(c2))) {
\r
209 // Rule (GB8) ( LVT | T) x T
\r
210 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
\r
211 fTSet.contains(c2)) {
\r
215 // Rule (GB9) x Extend
\r
216 if (fExtendSet.contains(c2)) {
\r
220 // Rule (GB9a) x SpacingMark
\r
221 if (fSpacingSet.contains(c2)) {
\r
225 // Rule (GB9b) Prepend x
\r
226 if (fPrependSet.contains(c1)) {
\r
230 // Rule (GB10) Any <break> Any
\r
242 * Word Monkey Test Class
\r
247 static class RBBIWordMonkey extends RBBIMonkeyKind {
\r
249 StringBuffer fText;
\r
253 UnicodeSet fNewlineSet;
\r
254 UnicodeSet fKatakanaSet;
\r
255 UnicodeSet fALetterSet;
\r
256 UnicodeSet fMidNumLetSet;
\r
257 UnicodeSet fMidLetterSet;
\r
258 UnicodeSet fMidNumSet;
\r
259 UnicodeSet fNumericSet;
\r
260 UnicodeSet fFormatSet;
\r
261 UnicodeSet fExtendSet;
\r
262 UnicodeSet fExtendNumLetSet;
\r
263 UnicodeSet fOtherSet;
\r
267 fCharProperty = UProperty.WORD_BREAK;
\r
269 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
\r
270 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
\r
271 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
\r
272 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
\r
273 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
\r
274 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
\r
275 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
\r
276 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
\r
277 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
\r
278 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
\r
279 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
\r
280 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
\r
282 fOtherSet = new UnicodeSet();
\r
283 fOtherSet.complement();
\r
284 fOtherSet.removeAll(fCRSet);
\r
285 fOtherSet.removeAll(fLFSet);
\r
286 fOtherSet.removeAll(fNewlineSet);
\r
287 fOtherSet.removeAll(fALetterSet);
\r
288 fOtherSet.removeAll(fKatakanaSet);
\r
289 fOtherSet.removeAll(fMidLetterSet);
\r
290 fOtherSet.removeAll(fMidNumSet);
\r
291 fOtherSet.removeAll(fNumericSet);
\r
292 fOtherSet.removeAll(fFormatSet);
\r
293 fOtherSet.removeAll(fExtendSet);
\r
294 fOtherSet.removeAll(fExtendNumLetSet);
\r
295 // Inhibit dictionary characters from being tested at all.
\r
296 fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
\r
298 fSets = new ArrayList();
\r
301 fSets.add(fNewlineSet);
\r
302 fSets.add(fALetterSet);
\r
303 fSets.add(fKatakanaSet);
\r
304 fSets.add(fMidLetterSet);
\r
305 fSets.add(fMidNumLetSet);
\r
306 fSets.add(fMidNumSet);
\r
307 fSets.add(fNumericSet);
\r
308 fSets.add(fFormatSet);
\r
309 fSets.add(fExtendSet);
\r
310 fSets.add(fExtendNumLetSet);
\r
311 fSets.add(fOtherSet);
\r
315 List charClasses() {
\r
319 void setText(StringBuffer s) {
\r
323 int next(int prevPos) {
\r
324 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
\r
325 // break position being tested. The candidate break
\r
326 // location is before p2.
\r
329 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
\r
331 // Previous break at end of string. return DONE.
\r
332 if (prevPos >= fText.length()) {
\r
335 /*p0 =*/ p1 = p2 = p3 = prevPos;
\r
336 c3 = UTF16.charAt(fText, prevPos);
\r
341 // Loop runs once per "significant" character position in the input text.
\r
343 // Move all of the positions forward in the input string.
\r
344 /*p0 = p1;*/ c0 = c1;
\r
348 // Advance p3 by X(Extend | Format)* Rule 4
\r
349 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
\r
351 p3 = moveIndex32(fText, p3, 1);
\r
353 if (p3>=fText.length()) {
\r
356 c3 = UTF16.charAt(fText, p3);
\r
357 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
\r
361 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
\r
364 // Still warming up the loop. (won't work with zero length strings, but we don't care)
\r
367 if (p2 == fText.length()) {
\r
368 // Reached end of string. Always a break position.
\r
372 // Rule (3) CR x LF
\r
373 // No Extend or Format characters may appear between the CR and LF,
\r
374 // which requires the additional check for p2 immediately following p1.
\r
376 if (c1==0x0D && c2==0x0A) {
\r
380 // Rule (3a) Break before and after newlines (including CR and LF)
\r
382 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
\r
385 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
\r
389 // Rule (5). ALetter x ALetter
\r
390 if (fALetterSet.contains(c1) &&
\r
391 fALetterSet.contains(c2)) {
\r
395 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
\r
397 if ( fALetterSet.contains(c1) &&
\r
398 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
\r
399 setContains(fALetterSet, c3)) {
\r
404 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
\r
405 if (fALetterSet.contains(c0) &&
\r
406 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
\r
407 fALetterSet.contains(c2)) {
\r
411 // Rule (8) Numeric x Numeric
\r
412 if (fNumericSet.contains(c1) &&
\r
413 fNumericSet.contains(c2)) {
\r
417 // Rule (9) ALetter x Numeric
\r
418 if (fALetterSet.contains(c1) &&
\r
419 fNumericSet.contains(c2)) {
\r
423 // Rule (10) Numeric x ALetter
\r
424 if (fNumericSet.contains(c1) &&
\r
425 fALetterSet.contains(c2)) {
\r
429 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
\r
430 if ( fNumericSet.contains(c0) &&
\r
431 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
\r
432 fNumericSet.contains(c2)) {
\r
436 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
\r
437 if (fNumericSet.contains(c1) &&
\r
438 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
\r
439 setContains(fNumericSet, c3)) {
\r
443 // Rule (13) Katakana x Katakana
\r
444 if (fKatakanaSet.contains(c1) &&
\r
445 fKatakanaSet.contains(c2)) {
\r
449 // Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
\r
450 if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
\r
451 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
\r
452 fExtendNumLetSet.contains(c2)) {
\r
455 // Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
\r
456 if (fExtendNumLetSet.contains(c1) &&
\r
457 (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
\r
458 fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
\r
462 // Rule 14. Break found here.
\r
473 static class RBBILineMonkey extends RBBIMonkeyKind {
\r
515 StringBuffer fText;
\r
516 int fOrigPositions;
\r
522 fCharProperty = UProperty.LINE_BREAK;
\r
523 fSets = new ArrayList();
\r
525 fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
\r
526 fCR = new UnicodeSet("[\\p{Line_break=CR}]");
\r
527 fLF = new UnicodeSet("[\\p{Line_break=LF}]");
\r
528 fCM = new UnicodeSet("[\\p{Line_break=CM}]");
\r
529 fNL = new UnicodeSet("[\\p{Line_break=NL}]");
\r
530 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
\r
531 fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
\r
532 fGL = new UnicodeSet("[\\p{Line_break=GL}]");
\r
533 fCB = new UnicodeSet("[\\p{Line_break=CB}]");
\r
534 fSP = new UnicodeSet("[\\p{Line_break=SP}]");
\r
535 fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
\r
536 fBA = new UnicodeSet("[\\p{Line_break=BA}]");
\r
537 fBB = new UnicodeSet("[\\p{Line_break=BB}]");
\r
538 fHY = new UnicodeSet("[\\p{Line_break=HY}]");
\r
539 fCL = new UnicodeSet("[\\p{Line_break=CL}]");
\r
540 fCP = new UnicodeSet("[\\p{Line_break=CP}]");
\r
541 fEX = new UnicodeSet("[\\p{Line_break=EX}]");
\r
542 fIN = new UnicodeSet("[\\p{Line_break=IN}]");
\r
543 fNS = new UnicodeSet("[\\p{Line_break=NS}]");
\r
544 fOP = new UnicodeSet("[\\p{Line_break=OP}]");
\r
545 fQU = new UnicodeSet("[\\p{Line_break=QU}]");
\r
546 fIS = new UnicodeSet("[\\p{Line_break=IS}]");
\r
547 fNU = new UnicodeSet("[\\p{Line_break=NU}]");
\r
548 fPO = new UnicodeSet("[\\p{Line_break=PO}]");
\r
549 fPR = new UnicodeSet("[\\p{Line_break=PR}]");
\r
550 fSY = new UnicodeSet("[\\p{Line_break=SY}]");
\r
551 fAI = new UnicodeSet("[\\p{Line_break=AI}]");
\r
552 fAL = new UnicodeSet("[\\p{Line_break=AL}]");
\r
553 fID = new UnicodeSet("[\\p{Line_break=ID}]");
\r
554 fSA = new UnicodeSet("[\\p{Line_break=SA}]");
\r
555 fJL = new UnicodeSet("[\\p{Line_break=JL}]");
\r
556 fJV = new UnicodeSet("[\\p{Line_break=JV}]");
\r
557 fJT = new UnicodeSet("[\\p{Line_break=JT}]");
\r
558 fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
\r
559 fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
\r
560 fSG = new UnicodeSet("[\\ud800-\\udfff]");
\r
561 fXX = new UnicodeSet("[\\p{Line_break=XX}]");
\r
564 fAL.addAll(fXX); // Default behavior for XX is identical to AL
\r
565 fAL.addAll(fAI); // Default behavior for AI is identical to AL
\r
566 fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
\r
567 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
\r
611 void setText(StringBuffer s) {
\r
618 int next(int startPos) {
\r
619 int pos; // Index of the char following a potential break position
\r
620 int thisChar; // Character at above position "pos"
\r
622 int prevPos; // Index of the char preceding a potential break position
\r
623 int prevChar; // Character at above position. Note that prevChar
\r
624 // and thisChar may not be adjacent because combining
\r
625 // characters between them will be ignored.
\r
627 int nextPos; // Index of the next character following pos.
\r
628 // Usually skips over combining marks.
\r
629 int tPos; // temp value.
\r
630 int matchVals[] = null; // Number Expression Match Results
\r
633 if (startPos >= fText.length()) {
\r
638 // Initial values for loop. Loop will run the first time without finding breaks,
\r
639 // while the invalid values shift out and the "this" and
\r
640 // "prev" positions are filled in with good values.
\r
641 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
\r
642 thisChar = prevChar = 0;
\r
643 nextPos = startPos;
\r
646 // Loop runs once per position in the test text, until a break position
\r
647 // is found. In each iteration, we are testing for a possible break
\r
648 // just preceding the character at index "pos". The character preceding
\r
649 // this char is at position "prevPos"; because of combining sequences,
\r
650 // "prevPos" can be arbitrarily far before "pos".
\r
652 // Advance to the next position to be tested.
\r
654 prevChar = thisChar;
\r
656 nextPos = moveIndex32(fText, pos, 1);
\r
658 // Rule LB2 - Break at end of text.
\r
659 if (pos >= fText.length()) {
\r
663 // Rule LB 9 - adjust for combining sequences.
\r
664 // We do this rule out-of-order because the adjustment does
\r
665 // not affect the way that rules LB 3 through LB 6 match,
\r
666 // and doing it here rather than after LB 6 is substantially
\r
667 // simpler when combining sequences do occur.
\r
670 // LB 9 Keep combining sequences together.
\r
671 // advance over any CM class chars at "pos",
\r
672 // result is "nextPos" for the following loop iteration.
\r
673 thisChar = UTF16.charAt(fText, pos);
\r
674 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
\r
675 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
\r
677 if (nextPos == fText.length()) {
\r
680 int nextChar = UTF16.charAt(fText, nextPos);
\r
681 if (!fCM.contains(nextChar)) {
\r
684 nextPos = moveIndex32(fText, nextPos, 1);
\r
688 // LB 9 Treat X CM* as if it were X
\r
689 // No explicit action required.
\r
691 // LB 10 Treat any remaining combining mark as AL
\r
692 if (fCM.contains(thisChar)) {
\r
697 // If the loop is still warming up - if we haven't shifted the initial
\r
698 // -1 positions out of prevPos yet - loop back to advance the
\r
699 // position in the input without any further looking for breaks.
\r
700 if (prevPos == -1) {
\r
704 // LB 4 Always break after hard line breaks,
\r
705 if (fBK.contains(prevChar)) {
\r
709 // LB 5 Break after CR, LF, NL, but not inside CR LF
\r
710 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
\r
713 if (fCR.contains(prevChar) ||
\r
714 fLF.contains(prevChar) ||
\r
715 fNL.contains(prevChar)) {
\r
719 // LB 6 Don't break before hard line breaks
\r
720 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
\r
721 fLF.contains(thisChar) || fNL.contains(thisChar) ) {
\r
726 // LB 7 Don't break before spaces or zero-width space.
\r
727 if (fSP.contains(thisChar)) {
\r
731 if (fZW.contains(thisChar)) {
\r
735 // LB 8 Break after zero width space
\r
736 if (fZW.contains(prevChar)) {
\r
740 // LB 9, 10 Already done, at top of loop.
\r
747 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
\r
754 if (fGL.contains(prevChar)) {
\r
759 // [^SP BA HY] x GL
\r
760 if (!(fSP.contains(prevChar) ||
\r
761 fBA.contains(prevChar) ||
\r
762 fHY.contains(prevChar) ) && fGL.contains(thisChar)) {
\r
768 // LB 13 Don't break before closings.
\r
769 // NU x CL, NU x CP and NU x IS are not matched here so that they will
\r
770 // fall into LB 17 and the more general number regular expression.
\r
772 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
\r
773 !fNU.contains(prevChar) && fCP.contains(thisChar) ||
\r
774 fEX.contains(thisChar) ||
\r
775 !fNU.contains(prevChar) && fIS.contains(thisChar) ||
\r
776 !fNU.contains(prevChar) && fSY.contains(thisChar)) {
\r
780 // LB 14 Don't break after OP SP*
\r
781 // Scan backwards, checking for this sequence.
\r
782 // The OP char could include combining marks, so we actually check for
\r
785 if (fSP.contains(prevChar)) {
\r
786 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
787 tPos=moveIndex32(fText, tPos, -1);
\r
790 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
791 tPos=moveIndex32(fText, tPos, -1);
\r
793 if (fOP.contains(UTF16.charAt(fText, tPos))) {
\r
797 // LB 15 Do not break within "[
\r
799 if (fOP.contains(thisChar)) {
\r
800 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
\r
802 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
803 tPos = moveIndex32(fText, tPos, -1);
\r
805 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
806 tPos = moveIndex32(fText, tPos, -1);
\r
808 if (fQU.contains(UTF16.charAt(fText, tPos))) {
\r
813 // LB 16 (CL | CP) SP* x NS
\r
814 if (fNS.contains(thisChar)) {
\r
816 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
817 tPos = moveIndex32(fText, tPos, -1);
\r
819 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
820 tPos = moveIndex32(fText, tPos, -1);
\r
822 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
\r
828 // LB 17 B2 SP* x B2
\r
829 if (fB2.contains(thisChar)) {
\r
831 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
832 tPos = moveIndex32(fText, tPos, -1);
\r
834 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
835 tPos = moveIndex32(fText, tPos, -1);
\r
837 if (fB2.contains(UTF16.charAt(fText, tPos))) {
\r
842 // LB 18 break after space
\r
843 if (fSP.contains(prevChar)) {
\r
850 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
\r
854 // LB 20 Break around a CB
\r
855 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
\r
860 if (fBA.contains(thisChar) ||
\r
861 fHY.contains(thisChar) ||
\r
862 fNS.contains(thisChar) ||
\r
863 fBB.contains(prevChar) ) {
\r
868 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
\r
869 fID.contains(prevChar) && fIN.contains(thisChar) ||
\r
870 fIN.contains(prevChar) && fIN.contains(thisChar) ||
\r
871 fNU.contains(prevChar) && fIN.contains(thisChar) ) {
\r
876 // LB 23 ID x PO (Note: Leading CM behaves like ID)
\r
879 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
\r
880 fAL.contains(prevChar) && fNU.contains(thisChar) ||
\r
881 fNU.contains(prevChar) && fAL.contains(thisChar) ) {
\r
885 // LB 24 Do not break between prefix and letters or ideographs.
\r
889 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
\r
890 fPR.contains(prevChar) && fAL.contains(thisChar) ||
\r
891 fPO.contains(prevChar) && fAL.contains(thisChar)) {
\r
897 matchVals = LBNumberCheck(fText, prevPos, matchVals);
\r
898 if (matchVals[0] != -1) {
\r
899 // Matched a number. But could have been just a single digit, which would
\r
900 // not represent a "no break here" between prevChar and thisChar
\r
901 int numEndIdx = matchVals[1]; // idx of first char following num
\r
902 if (numEndIdx > pos) {
\r
903 // Number match includes at least the two chars being checked
\r
904 if (numEndIdx > nextPos) {
\r
905 // Number match includes additional chars. Update pos and nextPos
\r
906 // so that next loop iteration will continue at the end of the number,
\r
907 // checking for breaks between last char in number & whatever follows.
\r
908 nextPos = numEndIdx;
\r
911 pos = moveIndex32(fText, pos, -1);
\r
912 thisChar = UTF16.charAt(fText, pos);
\r
914 while (fCM.contains(thisChar));
\r
921 // LB 26 Do not break Korean Syllables
\r
922 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
\r
923 fJV.contains(thisChar) ||
\r
924 fH2.contains(thisChar) ||
\r
925 fH3.contains(thisChar))) {
\r
929 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) &&
\r
930 (fJV.contains(thisChar) || fJT.contains(thisChar))) {
\r
934 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
\r
935 fJT.contains(thisChar)) {
\r
939 // LB 27 Treat a Korean Syllable Block the same as ID
\r
940 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
\r
941 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
\r
942 fIN.contains(thisChar)) {
\r
945 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
\r
946 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
\r
947 fPO.contains(thisChar)) {
\r
950 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
\r
951 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
\r
957 // LB 28 Do not break between alphabetics
\r
958 if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
\r
962 // LB 29 Do not break between numeric punctuation and alphabetics
\r
963 if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
\r
967 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
\r
970 if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
\r
973 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
\r
978 // LB 31 Break everywhere else
\r
987 // Match the following regular expression in the input text.
\r
988 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
\r
989 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states)
\r
990 // retVals array [0] index of the start of the match, or -1 if no match
\r
991 // [1] index of first char following the match.
\r
992 // Can not use Java regex because need supplementary character support,
\r
993 // and because Unicode char properties version must be the same as in
\r
994 // the version of ICU being tested.
\r
995 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
\r
996 if (retVals == null) {
\r
997 retVals = new int[2];
\r
999 retVals[0] = -1; // Indicates no match.
\r
1000 int matchState = 0;
\r
1001 int idx = startIdx;
\r
1003 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
\r
1004 int c = UTF16.charAt(s, idx);
\r
1005 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
\r
1006 switch (matchState) {
\r
1008 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
\r
1009 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
\r
1013 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
\r
1017 if (cLBType == UCharacter.LineBreak.HYPHEN) {
\r
1021 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1025 break matchLoop; /* No Match */
\r
1028 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1032 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
\r
1036 if (cLBType == UCharacter.LineBreak.HYPHEN) {
\r
1040 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1044 break matchLoop; /* No Match */
\r
1048 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1052 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1056 break matchLoop; /* No Match */
\r
1057 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? ((PR | PO) CM*)?
\r
1058 // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
\r
1061 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1065 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1069 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
\r
1073 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
\r
1077 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
\r
1081 if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
\r
1085 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
\r
1089 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
\r
1094 break matchLoop; // Match Complete.
\r
1096 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1100 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
\r
1104 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
\r
1108 break matchLoop; // Match Complete.
\r
1110 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1114 break matchLoop; // Match Complete.
\r
1117 if (matchState > 4) {
\r
1118 retVals[0] = startIdx;
\r
1119 retVals[1] = idx;
\r
1125 List charClasses() {
\r
1136 * Sentence Monkey Test Class
\r
1141 static class RBBISentenceMonkey extends RBBIMonkeyKind {
\r
1143 StringBuffer fText;
\r
1145 UnicodeSet fSepSet;
\r
1146 UnicodeSet fFormatSet;
\r
1147 UnicodeSet fSpSet;
\r
1148 UnicodeSet fLowerSet;
\r
1149 UnicodeSet fUpperSet;
\r
1150 UnicodeSet fOLetterSet;
\r
1151 UnicodeSet fNumericSet;
\r
1152 UnicodeSet fATermSet;
\r
1153 UnicodeSet fSContinueSet;
\r
1154 UnicodeSet fSTermSet;
\r
1155 UnicodeSet fCloseSet;
\r
1156 UnicodeSet fOtherSet;
\r
1157 UnicodeSet fExtendSet;
\r
1161 RBBISentenceMonkey() {
\r
1162 fCharProperty = UProperty.SENTENCE_BREAK;
\r
1164 fSets = new ArrayList();
\r
1166 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
\r
1167 // set and made into character classes of their own. For the monkey impl,
\r
1168 // they remain in SEP, since Sep always appears with CR and LF in the rules.
\r
1169 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
\r
1170 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]");
\r
1171 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
\r
1172 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
\r
1173 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
\r
1174 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
\r
1175 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
\r
1176 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
\r
1177 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
\r
1178 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
\r
1179 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
\r
1180 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
\r
1181 fOtherSet = new UnicodeSet();
\r
1184 fOtherSet.complement();
\r
1185 fOtherSet.removeAll(fSepSet);
\r
1186 fOtherSet.removeAll(fFormatSet);
\r
1187 fOtherSet.removeAll(fSpSet);
\r
1188 fOtherSet.removeAll(fLowerSet);
\r
1189 fOtherSet.removeAll(fUpperSet);
\r
1190 fOtherSet.removeAll(fOLetterSet);
\r
1191 fOtherSet.removeAll(fNumericSet);
\r
1192 fOtherSet.removeAll(fATermSet);
\r
1193 fOtherSet.removeAll(fSContinueSet);
\r
1194 fOtherSet.removeAll(fSTermSet);
\r
1195 fOtherSet.removeAll(fCloseSet);
\r
1196 fOtherSet.removeAll(fExtendSet);
\r
1198 fSets.add(fSepSet);
\r
1199 fSets.add(fFormatSet);
\r
1201 fSets.add(fSpSet);
\r
1202 fSets.add(fLowerSet);
\r
1203 fSets.add(fUpperSet);
\r
1204 fSets.add(fOLetterSet);
\r
1205 fSets.add(fNumericSet);
\r
1206 fSets.add(fATermSet);
\r
1207 fSets.add(fSContinueSet);
\r
1208 fSets.add(fSTermSet);
\r
1209 fSets.add(fCloseSet);
\r
1210 fSets.add(fOtherSet);
\r
1211 fSets.add(fExtendSet);
\r
1215 List charClasses() {
\r
1219 void setText(StringBuffer s) {
\r
1224 // moveBack() Find the "significant" code point preceding the index i.
\r
1225 // Skips over ($Extend | $Format)*
\r
1227 private int moveBack(int i) {
\r
1236 j = moveIndex32(fText, j, -1);
\r
1237 c = UTF16.charAt(fText, j);
\r
1239 while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
\r
1244 int moveForward(int i) {
\r
1245 if (i>=fText.length()) {
\r
1246 return fText.length();
\r
1251 j = moveIndex32(fText, j, 1);
\r
1254 while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
\r
1259 int cAt(int pos) {
\r
1260 if (pos<0 || pos>=fText.length()) {
\r
1263 return UTF16.charAt(fText, pos);
\r
1266 int next(int prevPos) {
\r
1267 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
\r
1268 // break position being tested. The candidate break
\r
1269 // location is before p2.
\r
1270 int breakPos = -1;
\r
1272 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
\r
1275 // Prev break at end of string. return DONE.
\r
1276 if (prevPos >= fText.length()) {
\r
1279 /*p0 =*/ p1 = p2 = p3 = prevPos;
\r
1280 c3 = UTF16.charAt(fText, prevPos);
\r
1283 // Loop runs once per "significant" character position in the input text.
\r
1285 // Move all of the positions forward in the input string.
\r
1286 /*p0 = p1;*/ c0 = c1;
\r
1290 // Advance p3 by X(Extend | Format)* Rule 4
\r
1291 p3 = moveForward(p3);
\r
1294 // Rule (3) CR x LF
\r
1295 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
\r
1299 // Rule (4) Sep <break>
\r
1300 if (fSepSet.contains(c1)) {
\r
1301 p2 = p1+1; // Separators don't combine with Extend or Format
\r
1305 if (p2 >= fText.length()) {
\r
1306 // Reached end of string. Always a break position.
\r
1310 if (p2 == prevPos) {
\r
1311 // Still warming up the loop. (won't work with zero length strings, but we don't care)
\r
1315 // Rule (6). ATerm x Numeric
\r
1316 if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
\r
1320 // Rule (7). Upper ATerm x Upper
\r
1321 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {
\r
1325 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep))* Lower
\r
1326 // Note: STerm | ATerm are added to the negated part of the expression by a
\r
1327 // note to the Unicode 5.0 documents.
\r
1329 while (p8>0 && fSpSet.contains(cAt(p8))) {
\r
1330 p8 = moveBack(p8);
\r
1332 while (p8>0 && fCloseSet.contains(cAt(p8))) {
\r
1333 p8 = moveBack(p8);
\r
1335 if (fATermSet.contains(cAt(p8))) {
\r
1339 if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
\r
1340 fLowerSet.contains(c) || fSepSet.contains(c) ||
\r
1341 fATermSet.contains(c) || fSTermSet.contains(c))
\r
1345 p8 = moveForward(p8);
\r
1347 if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
\r
1352 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)
\r
1353 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
\r
1355 while (setContains(fSpSet, cAt(p8))) {
\r
1356 p8 = moveBack(p8);
\r
1358 while (setContains(fCloseSet, cAt(p8))) {
\r
1359 p8 = moveBack(p8);
\r
1362 if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
\r
1368 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
\r
1370 while (p9>0 && fCloseSet.contains(cAt(p9))) {
\r
1371 p9 = moveBack(p9);
\r
1374 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
\r
1375 if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
\r
1380 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
\r
1382 while (p10>0 && fSpSet.contains(cAt(p10))) {
\r
1383 p10 = moveBack(p10);
\r
1385 while (p10>0 && fCloseSet.contains(cAt(p10))) {
\r
1386 p10 = moveBack(p10);
\r
1388 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
\r
1389 if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
\r
1394 // Rule (11) (STerm | ATerm) Close* Sp* <break>
\r
1396 if (p11>0 && fSepSet.contains(cAt(p11))) {
\r
1397 p11 = moveBack(p11);
\r
1399 while (p11>0 && fSpSet.contains(cAt(p11))) {
\r
1400 p11 = moveBack(p11);
\r
1402 while (p11>0 && fCloseSet.contains(cAt(p11))) {
\r
1403 p11 = moveBack(p11);
\r
1405 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
\r
1409 // Rule (12) Any x Any
\r
1422 * Move an index into a string by n code points.
\r
1423 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
\r
1424 * complicating usage.
\r
1425 * @param s a Text string
\r
1426 * @param pos The starting code unit index into the text string
\r
1427 * @param amt The number of code points to move the index by; a negative value moves backward.
\r
1428 * @return The adjusted code unit index, pinned to the string's length, or
\r
1429 * unchanged if input index was outside of the string.
\r
1431 static int moveIndex32(StringBuffer s, int pos, int amt) {
\r
1435 for (i=0; i<amt; i++) {
\r
1436 if (pos >= s.length()) {
\r
1437 return s.length();
\r
1439 c = s.charAt(pos);
\r
1441 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
\r
1442 c = s.charAt(pos);
\r
1443 if (UTF16.isTrailSurrogate(c)) {
\r
1449 for (i=0; i>amt; i--) {
\r
1454 c = s.charAt(pos);
\r
1455 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
\r
1456 c = s.charAt(pos);
\r
1457 if (UTF16.isLeadSurrogate(c)) {
\r
1467 * No-exceptions form of UnicodeSet.contains(c).
\r
1468 * Simplifies loops that terminate with an end-of-input character value.
\r
1469 * @param s A unicode set
\r
1470 * @param c A code point value
\r
1471 * @return true if the set contains c.
\r
1473 static boolean setContains(UnicodeSet s, int c) {
\r
1474 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
\r
1477 return s.contains(c);
\r
1482 * return the index of the next code point in the input text.
\r
1483 * @param i the preceding index
\r
1486 static int nextCP(StringBuffer s, int i) {
\r
1488 // End of Input indication. Continue to return end value.
\r
1491 int retVal = i + 1;
\r
1492 if (retVal > s.length()) {
\r
1495 int c = UTF16.charAt(s, i);
\r
1496 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
\r
1504 * random number generator. Not using Java's built-in Randoms for two reasons:
\r
1505 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
\r
1506 * 2. We need to get and restore the seed from values occurring in the middle
\r
1507 * of a long sequence, to more easily reproduce failing cases.
\r
1509 private static int m_seed = 1;
\r
1510 private static int m_rand()
\r
1512 m_seed = m_seed * 1103515245 + 12345;
\r
1513 return (int)(m_seed >>> 16) % 32768;
\r
1516 // Helper function for formatting error output.
\r
1517 // Append a string into a fixed-size field in a StringBuffer.
\r
1518 // Blank-pad the string if it is shorter than the field.
\r
1519 // Truncate the source string if it is too long.
\r
1521 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
\r
1522 int appendLen = src.length();
\r
1523 if (appendLen >= fieldLen) {
\r
1524 dest.append(src.substring(0, fieldLen));
\r
1527 while (appendLen < fieldLen) {
\r
1534 // Helper function for formatting error output.
\r
1535 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
\r
1536 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
\r
1537 String hexChars = "0123456789abcdef";
\r
1538 if (c < 0x10000) {
\r
1539 dest.append("\\u");
\r
1540 for (int bn=12; bn>=0; bn-=4) {
\r
1541 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
\r
1543 appendToBuf(dest, " ", fieldLen-6);
\r
1545 dest.append("\\U");
\r
1546 for (int bn=28; bn>=0; bn-=4) {
\r
1547 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
\r
1549 appendToBuf(dest, " ", fieldLen-10);
\r
1555 * Run a RBBI monkey test. Common routine, for all break iterator types.
\r
1557 * bi - the break iterator to use
\r
1558 * mk - MonkeyKind, abstraction for obtaining expected results
\r
1559 * name - Name of test (char, word, etc.) for use in error messages
\r
1560 * seed - Seed for starting random number generator (parameter from user)
\r
1563 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) {
\r
1564 int TESTSTRINGLEN = 500;
\r
1565 StringBuffer testText = new StringBuffer();
\r
1566 int numCharClasses;
\r
1568 int[] expected = new int[TESTSTRINGLEN*2 + 1];
\r
1569 int expectedCount = 0;
\r
1570 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1571 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1572 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1573 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1574 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1575 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1577 int loopCount = 0;
\r
1578 boolean printTestData = false;
\r
1579 boolean printBreaksFromBI = false;
\r
1583 numCharClasses = mk.charClasses().size();
\r
1584 chClasses = mk.charClasses();
\r
1586 // Verify that the character classes all have at least one member.
\r
1587 for (i=0; i<numCharClasses; i++) {
\r
1588 UnicodeSet s = (UnicodeSet)chClasses.get(i);
\r
1589 if (s == null || s.size() == 0) {
\r
1590 errln("Character Class " + i + " is null or of zero size.");
\r
1595 //--------------------------------------------------------------------------------------------
\r
1597 // Debugging settings. Comment out everything in the following block for normal operation
\r
1599 //--------------------------------------------------------------------------------------------
\r
1600 // numIterations = -1;
\r
1601 // RuleBasedBreakIterator_New.fTrace = true;
\r
1602 // m_seed = 859056465;
\r
1603 // TESTSTRINGLEN = 50;
\r
1604 // printTestData = true;
\r
1605 // printBreaksFromBI = true;
\r
1606 // ((RuleBasedBreakIterator_New)bi).dump();
\r
1608 //--------------------------------------------------------------------------------------------
\r
1610 // End of Debugging settings.
\r
1612 //--------------------------------------------------------------------------------------------
\r
1614 int dotsOnLine = 0;
\r
1615 while (loopCount < numIterations || numIterations == -1) {
\r
1616 if (numIterations == -1 && loopCount % 10 == 0) {
\r
1617 // If test is running in an infinite loop, display a periodic tick so
\r
1618 // we can tell that it is making progress.
\r
1619 System.out.print(".");
\r
1620 if (dotsOnLine++ >= 80){
\r
1621 System.out.println();
\r
1625 // Save current random number seed, so that we can recreate the random numbers
\r
1626 // for this loop iteration in event of an error.
\r
1629 testText.setLength(0);
\r
1630 // Populate a test string with data.
\r
1631 if (printTestData) {
\r
1632 System.out.println("Test Data string ...");
\r
1634 for (i=0; i<TESTSTRINGLEN; i++) {
\r
1635 int aClassNum = m_rand() % numCharClasses;
\r
1636 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum);
\r
1637 int charIdx = m_rand() % classSet.size();
\r
1638 int c = classSet.charAt(charIdx);
\r
1639 if (c < 0) { // TODO: deal with sets containing strings.
\r
1642 UTF16.appendCodePoint(testText, c);
\r
1643 if (printTestData) {
\r
1644 System.out.print(Integer.toHexString(c) + " ");
\r
1647 if (printTestData) {
\r
1648 System.out.println();
\r
1651 Arrays.fill(expected, 0);
\r
1652 Arrays.fill(expectedBreaks, false);
\r
1653 Arrays.fill(forwardBreaks, false);
\r
1654 Arrays.fill(reverseBreaks, false);
\r
1655 Arrays.fill(isBoundaryBreaks, false);
\r
1656 Arrays.fill(followingBreaks, false);
\r
1657 Arrays.fill(precedingBreaks, false);
\r
1659 // Calculate the expected results for this test string.
\r
1660 mk.setText(testText);
\r
1661 expectedCount = 0;
\r
1662 expectedBreaks[0] = true;
\r
1663 expected[expectedCount ++] = 0;
\r
1665 int lastBreakPos = -1;
\r
1667 lastBreakPos = breakPos;
\r
1668 breakPos = mk.next(breakPos);
\r
1669 if (breakPos == -1) {
\r
1672 if (breakPos > testText.length()) {
\r
1673 errln("breakPos > testText.length()");
\r
1675 if (lastBreakPos >= breakPos) {
\r
1676 errln("Next() not increasing.");
\r
1679 expectedBreaks[breakPos] = true;
\r
1680 expected[expectedCount ++] = breakPos;
\r
1683 // Find the break positions using forward iteration
\r
1684 if (printBreaksFromBI) {
\r
1685 System.out.println("Breaks from BI...");
\r
1687 bi.setText(testText.toString());
\r
1688 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
\r
1689 if (i < 0 || i > testText.length()) {
\r
1690 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
\r
1693 if (printBreaksFromBI) {
\r
1694 System.out.print(Integer.toHexString(i) + " ");
\r
1696 forwardBreaks[i] = true;
\r
1698 if (printBreaksFromBI) {
\r
1699 System.out.println();
\r
1702 // Find the break positions using reverse iteration
\r
1703 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
\r
1704 if (i < 0 || i > testText.length()) {
\r
1705 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
\r
1708 reverseBreaks[i] = true;
\r
1711 // Find the break positions using isBoundary() tests.
\r
1712 for (i=0; i<=testText.length(); i++) {
\r
1713 isBoundaryBreaks[i] = bi.isBoundary(i);
\r
1716 // Find the break positions using the following() function.
\r
1718 followingBreaks[0] = true;
\r
1719 for (i=0; i<testText.length(); i++) {
\r
1720 breakPos = bi.following(i);
\r
1721 if (breakPos <= i ||
\r
1722 breakPos < lastBreakPos ||
\r
1723 breakPos > testText.length() ||
\r
1724 breakPos > lastBreakPos && lastBreakPos > i ) {
\r
1725 errln(name + " break monkey test: " +
\r
1726 "Out of range value returned by BreakIterator::following().\n" +
\r
1727 "index=" + i + "following returned=" + breakPos +
\r
1728 "lastBreak=" + lastBreakPos);
\r
1729 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
\r
1731 followingBreaks[breakPos] = true;
\r
1732 lastBreakPos = breakPos;
\r
1736 // Find the break positions using the preceding() function.
\r
1737 lastBreakPos = testText.length();
\r
1738 precedingBreaks[testText.length()] = true;
\r
1739 for (i=testText.length(); i>0; i--) {
\r
1740 breakPos = bi.preceding(i);
\r
1741 if (breakPos >= i ||
\r
1742 breakPos > lastBreakPos ||
\r
1744 breakPos < lastBreakPos && lastBreakPos < i ) {
\r
1745 errln(name + " break monkey test: " +
\r
1746 "Out of range value returned by BreakIterator::preceding().\n" +
\r
1747 "index=" + i + "preceding returned=" + breakPos +
\r
1748 "lastBreak=" + lastBreakPos);
\r
1749 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
\r
1751 precedingBreaks[breakPos] = true;
\r
1752 lastBreakPos = breakPos;
\r
1758 // Compare the expected and actual results.
\r
1759 for (i=0; i<=testText.length(); i++) {
\r
1760 String errorType = null;
\r
1761 if (forwardBreaks[i] != expectedBreaks[i]) {
\r
1762 errorType = "next()";
\r
1763 } else if (reverseBreaks[i] != forwardBreaks[i]) {
\r
1764 errorType = "previous()";
\r
1765 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
\r
1766 errorType = "isBoundary()";
\r
1767 } else if (followingBreaks[i] != expectedBreaks[i]) {
\r
1768 errorType = "following()";
\r
1769 } else if (precedingBreaks[i] != expectedBreaks[i]) {
\r
1770 errorType = "preceding()";
\r
1774 if (errorType != null) {
\r
1775 // Format a range of the test text that includes the failure as
\r
1776 // a data item that can be included in the rbbi test data file.
\r
1778 // Start of the range is the last point where expected and actual results
\r
1779 // both agreed that there was a break position.
\r
1780 int startContext = i;
\r
1783 if (startContext==0) { break; }
\r
1785 if (expectedBreaks[startContext]) {
\r
1786 if (count == 2) break;
\r
1791 // End of range is two expected breaks past the start position.
\r
1792 int endContext = i + 1;
\r
1794 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
\r
1796 if (endContext >= testText.length()) {break;}
\r
1797 if (expectedBreaks[endContext-1]) {
\r
1798 if (count == 0) break;
\r
1805 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
\r
1806 StringBuffer errorText = new StringBuffer();
\r
1808 int c; // Char from test data
\r
1809 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
\r
1811 // This is the location of the error.
\r
1812 errorText.append("<?>---------------------------------\n");
\r
1813 } else if (expectedBreaks[ci]) {
\r
1814 // This is a non-error expected break position.
\r
1815 errorText.append("------------------------------------\n");
\r
1817 if (ci < testText.length()) {
\r
1818 c = UTF16.charAt(testText, ci);
\r
1819 appendCharToBuf(errorText, c, 11);
\r
1820 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
\r
1821 appendToBuf(errorText, gc, 8);
\r
1822 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
\r
1823 String extraPropValue =
\r
1824 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
\r
1825 appendToBuf(errorText, extraPropValue, 20);
\r
1827 String charName = UCharacter.getExtendedName(c);
\r
1828 appendToBuf(errorText, charName, 40);
\r
1829 errorText.append('\n');
\r
1832 if (ci == testText.length() && ci != -1) {
\r
1833 errorText.append("<>");
\r
1835 errorText.append("</data>\n");
\r
1837 // Output the error
\r
1838 errln(name + " break monkey test error. " +
\r
1839 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
\r
1840 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" +
\r
1850 public void TestCharMonkey() {
\r
1852 int loopCount = 500;
\r
1855 if (params.inclusion >= 9) {
\r
1856 loopCount = 10000;
\r
1859 RBBICharMonkey m = new RBBICharMonkey();
\r
1860 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
\r
1861 RunMonkey(bi, m, "char", seed, loopCount);
\r
1864 public void TestWordMonkey() {
\r
1866 int loopCount = 500;
\r
1869 if (params.inclusion >= 9) {
\r
1870 loopCount = 10000;
\r
1873 logln("Word Break Monkey Test");
\r
1874 RBBIWordMonkey m = new RBBIWordMonkey();
\r
1875 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
\r
1876 RunMonkey(bi, m, "word", seed, loopCount);
\r
1879 public void TestLineMonkey() {
\r
1880 int loopCount = 500;
\r
1883 if (params.inclusion >= 9) {
\r
1884 loopCount = 10000;
\r
1887 logln("Line Break Monkey Test");
\r
1888 RBBILineMonkey m = new RBBILineMonkey();
\r
1889 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
\r
1890 if (params == null) {
\r
1893 RunMonkey(bi, m, "line", seed, loopCount);
\r
1896 public void TestSentMonkey() {
\r
1898 int loopCount = 500;
\r
1901 if (params.inclusion >= 9) {
\r
1905 logln("Sentence Break Monkey Test");
\r
1906 RBBISentenceMonkey m = new RBBISentenceMonkey();
\r
1907 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
\r
1908 if (params == null) {
\r
1911 RunMonkey(bi, m, "sent", seed, loopCount);
\r
1914 // Round-trip monkey tests.
\r
1915 // Verify that break iterators created from the rule source from the default
\r
1916 // break iterators still pass the monkey test for the iterator type.
\r
1918 // This is a major test for the Rule Compiler. The default break iterators are built
\r
1919 // from pre-compiled binary rule data that was created using ICU4C; these
\r
1920 // round-trip rule recompile tests verify that the Java rule compiler can
\r
1921 // rebuild break iterators from the original source rules.
\r
1923 public void TestRTCharMonkey() {
\r
1925 int loopCount = 200;
\r
1928 if (params.inclusion >= 9) {
\r
1932 RBBICharMonkey m = new RBBICharMonkey();
\r
1933 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
\r
1934 String rules = bi.toString();
\r
1935 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
1936 RunMonkey(rtbi, m, "char", seed, loopCount);
\r
1939 public void TestRTWordMonkey() {
\r
1941 int loopCount = 200;
\r
1944 if (params.inclusion >= 9) {
\r
1948 logln("Word Break Monkey Test");
\r
1949 RBBIWordMonkey m = new RBBIWordMonkey();
\r
1950 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
\r
1951 String rules = bi.toString();
\r
1952 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
1953 RunMonkey(rtbi, m, "word", seed, loopCount);
\r
1956 public void TestRTLineMonkey() {
\r
1957 int loopCount = 200;
\r
1960 if (params.inclusion >= 9) {
\r
1964 logln("Line Break Monkey Test");
\r
1965 RBBILineMonkey m = new RBBILineMonkey();
\r
1966 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
\r
1967 String rules = bi.toString();
\r
1968 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
1969 if (params == null) {
\r
1972 RunMonkey(rtbi, m, "line", seed, loopCount);
\r
1975 public void TestRTSentMonkey() {
\r
1977 int loopCount = 200;
\r
1980 if (params.inclusion >= 9) {
\r
1984 logln("Sentence Break Monkey Test");
\r
1985 RBBISentenceMonkey m = new RBBISentenceMonkey();
\r
1986 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
\r
1987 String rules = bi.toString();
\r
1988 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
1989 if (params == null) {
\r
1992 RunMonkey(rtbi, m, "sent", seed, loopCount);
\r