2 *******************************************************************************
\r
3 * Copyright (C) 2003-2008 International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.rbbi;
\r
10 // Monkey testing of RuleBasedBreakIterator
\r
11 import java.util.ArrayList;
\r
12 import java.util.Arrays;
\r
13 import java.util.List;
\r
14 import java.util.Locale;
\r
16 import com.ibm.icu.dev.test.TestFmwk;
\r
17 import com.ibm.icu.lang.UCharacter;
\r
18 import com.ibm.icu.lang.UProperty;
\r
19 import com.ibm.icu.text.BreakIterator;
\r
20 import com.ibm.icu.text.RuleBasedBreakIterator;
\r
21 import com.ibm.icu.text.UTF16;
\r
22 import com.ibm.icu.text.UnicodeSet;
\r
26 * Monkey tests for RBBI. These tests have independent implementations of
\r
27 * the Unicode TR boundary rules, and compare results between these and ICU's
\r
28 * implementation, using random data.
\r
30 * Tests cover Grapheme Cluster (char), Word and Line breaks
\r
32 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
\r
35 public class RBBITestMonkey extends TestFmwk {
\r
37 public static void main(String[] args) {
\r
38 new RBBITestMonkey().run(args);
\r
42 // class RBBIMonkeyKind
\r
44 // Monkey Test for Break Iteration
\r
45 // Abstract interface class. Concrete derived classes independently
\r
46 // implement the break rules for different iterator types.
\r
48 // The Monkey Test itself doesn't know which type of break iterator it is
\r
49 // testing, but works purely in terms of the interface defined here.
\r
51 abstract static class RBBIMonkeyKind {
\r
53 // Return a List of UnicodeSets, representing the character classes used
\r
54 // for this type of iterator.
\r
55 abstract List charClasses();
\r
57 // Set the test text on which subsequent calls to next() will operate
\r
58 abstract void setText(StringBuffer text);
\r
60 // Find the next break position, starting from the specified position.
\r
61 // Return -1 after reaching end of string.
\r
62 abstract int next(int i);
\r
64 // A Character Property, one of the constants defined in class UProperty.
\r
65 // The value of this property will be displayed for the characters
\r
66 // near any test failure.
\r
72 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
\r
74 static class RBBICharMonkey extends RBBIMonkeyKind {
\r
77 UnicodeSet fCRLFSet;
\r
78 UnicodeSet fControlSet;
\r
79 UnicodeSet fExtendSet;
\r
80 UnicodeSet fPrependSet;
\r
81 UnicodeSet fSpacingSet;
\r
87 UnicodeSet fHangulSet;
\r
95 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
\r
96 fCRLFSet = new UnicodeSet("[\\r\\n]");
\r
97 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
\r
98 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
\r
99 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
\r
100 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
\r
101 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
\r
102 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
\r
103 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
\r
104 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
\r
105 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
\r
106 fHangulSet = new UnicodeSet();
\r
107 fHangulSet.addAll(fLSet);
\r
108 fHangulSet.addAll(fVSet);
\r
109 fHangulSet.addAll(fTSet);
\r
110 fHangulSet.addAll(fLVSet);
\r
111 fHangulSet.addAll(fLVTSet);
\r
113 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
\r
115 fSets = new ArrayList();
\r
116 fSets.add(fCRLFSet);
\r
117 fSets.add(fControlSet);
\r
118 fSets.add(fExtendSet);
\r
119 fSets.add(fPrependSet);
\r
120 fSets.add(fSpacingSet);
\r
121 fSets.add(fHangulSet);
\r
122 fSets.add(fAnySet);
\r
126 void setText(StringBuffer s) {
\r
130 List charClasses() {
\r
134 int next(int prevPos) {
\r
135 int p1, p2, p3; // Indices of the significant code points around the
\r
136 // break position being tested. The candidate break
\r
137 // location is before p2.
\r
141 int c1, c2, c3; // The code points at p1, p2 & p3.
\r
143 // Previous break at end of string. return DONE.
\r
144 if (prevPos >= fText.length()) {
\r
147 p1 = p2 = p3 = prevPos;
\r
148 c3 = UTF16.charAt(fText, prevPos);
\r
151 // Loop runs once per "significant" character position in the input text.
\r
153 // Move all of the positions forward in the input string.
\r
157 // Advance p3 by one codepoint
\r
158 p3 = moveIndex32(fText, p3, 1);
\r
159 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
\r
162 // Still warming up the loop. (won't work with zero length strings, but we don't care)
\r
165 if (p2 == fText.length()) {
\r
166 // Reached end of string. Always a break position.
\r
170 // Rule GB3 CR x LF
\r
171 // No Extend or Format characters may appear between the CR and LF,
\r
172 // which requires the additional check for p2 immediately following p1.
\r
174 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
\r
178 // Rule (GB4). ( Control | CR | LF ) <break>
\r
179 if (fControlSet.contains(c1) ||
\r
185 // Rule (GB5) <break> ( Control | CR | LF )
\r
187 if (fControlSet.contains(c2) ||
\r
194 // Rule (GB6) L x ( L | V | LV | LVT )
\r
195 if (fLSet.contains(c1) &&
\r
196 (fLSet.contains(c2) ||
\r
197 fVSet.contains(c2) ||
\r
198 fLVSet.contains(c2) ||
\r
199 fLVTSet.contains(c2))) {
\r
203 // Rule (GB7) ( LV | V ) x ( V | T )
\r
204 if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
\r
205 (fVSet.contains(c2) || fTSet.contains(c2))) {
\r
209 // Rule (GB8) ( LVT | T) x T
\r
210 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
\r
211 fTSet.contains(c2)) {
\r
215 // Rule (GB9) x Extend
\r
216 if (fExtendSet.contains(c2)) {
\r
220 // Rule (GB9a) x SpacingMark
\r
221 if (fSpacingSet.contains(c2)) {
\r
225 // Rule (GB9b) Prepend x
\r
226 if (fPrependSet.contains(c1)) {
\r
230 // Rule (GB10) Any <break> Any
\r
242 * Word Monkey Test Class
\r
247 static class RBBIWordMonkey extends RBBIMonkeyKind {
\r
249 StringBuffer fText;
\r
253 UnicodeSet fNewlineSet;
\r
254 UnicodeSet fKatakanaSet;
\r
255 UnicodeSet fALetterSet;
\r
256 UnicodeSet fMidNumLetSet;
\r
257 UnicodeSet fMidLetterSet;
\r
258 UnicodeSet fMidNumSet;
\r
259 UnicodeSet fNumericSet;
\r
260 UnicodeSet fFormatSet;
\r
261 UnicodeSet fExtendSet;
\r
262 UnicodeSet fExtendNumLetSet;
\r
263 UnicodeSet fOtherSet;
\r
267 fCharProperty = UProperty.WORD_BREAK;
\r
269 fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
\r
270 fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
\r
271 fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
\r
272 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
\r
273 fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
\r
274 fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
\r
275 fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
\r
276 fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
\r
277 fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
\r
278 fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
\r
279 fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
\r
280 fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
\r
282 fOtherSet = new UnicodeSet();
\r
283 fOtherSet.complement();
\r
284 fOtherSet.removeAll(fCRSet);
\r
285 fOtherSet.removeAll(fLFSet);
\r
286 fOtherSet.removeAll(fNewlineSet);
\r
287 fOtherSet.removeAll(fALetterSet);
\r
288 fOtherSet.removeAll(fKatakanaSet);
\r
289 fOtherSet.removeAll(fMidLetterSet);
\r
290 fOtherSet.removeAll(fMidNumSet);
\r
291 fOtherSet.removeAll(fNumericSet);
\r
292 fOtherSet.removeAll(fFormatSet);
\r
293 fOtherSet.removeAll(fExtendSet);
\r
294 fOtherSet.removeAll(fExtendNumLetSet);
\r
295 // Inhibit dictionary characters from being tested at all.
\r
296 fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
\r
298 fSets = new ArrayList();
\r
301 fSets.add(fNewlineSet);
\r
302 fSets.add(fALetterSet);
\r
303 fSets.add(fKatakanaSet);
\r
304 fSets.add(fMidLetterSet);
\r
305 fSets.add(fMidNumLetSet);
\r
306 fSets.add(fMidNumSet);
\r
307 fSets.add(fNumericSet);
\r
308 fSets.add(fFormatSet);
\r
309 fSets.add(fExtendSet);
\r
310 fSets.add(fExtendNumLetSet);
\r
311 fSets.add(fOtherSet);
\r
315 List charClasses() {
\r
319 void setText(StringBuffer s) {
\r
323 int next(int prevPos) {
\r
324 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
\r
325 // break position being tested. The candidate break
\r
326 // location is before p2.
\r
329 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
\r
331 // Previous break at end of string. return DONE.
\r
332 if (prevPos >= fText.length()) {
\r
335 /*p0 =*/ p1 = p2 = p3 = prevPos;
\r
336 c3 = UTF16.charAt(fText, prevPos);
\r
341 // Loop runs once per "significant" character position in the input text.
\r
343 // Move all of the positions forward in the input string.
\r
344 /*p0 = p1;*/ c0 = c1;
\r
348 // Advance p3 by X(Extend | Format)* Rule 4
\r
349 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
\r
351 p3 = moveIndex32(fText, p3, 1);
\r
353 if (p3>=fText.length()) {
\r
356 c3 = UTF16.charAt(fText, p3);
\r
357 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
\r
361 while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));
\r
364 // Still warming up the loop. (won't work with zero length strings, but we don't care)
\r
367 if (p2 == fText.length()) {
\r
368 // Reached end of string. Always a break position.
\r
372 // Rule (3) CR x LF
\r
373 // No Extend or Format characters may appear between the CR and LF,
\r
374 // which requires the additional check for p2 immediately following p1.
\r
376 if (c1==0x0D && c2==0x0A) {
\r
380 // Rule (3a) Break before and after newlines (including CR and LF)
\r
382 if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
\r
385 if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
\r
389 // Rule (5). ALetter x ALetter
\r
390 if (fALetterSet.contains(c1) &&
\r
391 fALetterSet.contains(c2)) {
\r
395 // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
\r
397 if ( fALetterSet.contains(c1) &&
\r
398 (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
\r
399 setContains(fALetterSet, c3)) {
\r
404 // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
\r
405 if (fALetterSet.contains(c0) &&
\r
406 (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
\r
407 fALetterSet.contains(c2)) {
\r
411 // Rule (8) Numeric x Numeric
\r
412 if (fNumericSet.contains(c1) &&
\r
413 fNumericSet.contains(c2)) {
\r
417 // Rule (9) ALetter x Numeric
\r
418 if (fALetterSet.contains(c1) &&
\r
419 fNumericSet.contains(c2)) {
\r
423 // Rule (10) Numeric x ALetter
\r
424 if (fNumericSet.contains(c1) &&
\r
425 fALetterSet.contains(c2)) {
\r
429 // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
\r
430 if ( fNumericSet.contains(c0) &&
\r
431 (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
\r
432 fNumericSet.contains(c2)) {
\r
436 // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
\r
437 if (fNumericSet.contains(c1) &&
\r
438 (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
\r
439 setContains(fNumericSet, c3)) {
\r
443 // Rule (13) Katakana x Katakana
\r
444 if (fKatakanaSet.contains(c1) &&
\r
445 fKatakanaSet.contains(c2)) {
\r
449 // Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
\r
450 if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
\r
451 fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
\r
452 fExtendNumLetSet.contains(c2)) {
\r
455 // Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
\r
456 if (fExtendNumLetSet.contains(c1) &&
\r
457 (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
\r
458 fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
\r
462 // Rule 14. Break found here.
\r
473 static class RBBILineMonkey extends RBBIMonkeyKind {
\r
514 StringBuffer fText;
\r
515 int fOrigPositions;
\r
521 fCharProperty = UProperty.LINE_BREAK;
\r
522 fSets = new ArrayList();
\r
524 fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
\r
525 fCR = new UnicodeSet("[\\p{Line_break=CR}]");
\r
526 fLF = new UnicodeSet("[\\p{Line_break=LF}]");
\r
527 fCM = new UnicodeSet("[\\p{Line_break=CM}]");
\r
528 fNL = new UnicodeSet("[\\p{Line_break=NL}]");
\r
529 fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
\r
530 fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
\r
531 fGL = new UnicodeSet("[\\p{Line_break=GL}]");
\r
532 fCB = new UnicodeSet("[\\p{Line_break=CB}]");
\r
533 fSP = new UnicodeSet("[\\p{Line_break=SP}]");
\r
534 fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
\r
535 fBA = new UnicodeSet("[\\p{Line_break=BA}]");
\r
536 fBB = new UnicodeSet("[\\p{Line_break=BB}]");
\r
537 fHY = new UnicodeSet("[\\p{Line_break=HY}]");
\r
538 fCL = new UnicodeSet("[\\p{Line_break=CL}]");
\r
539 fEX = new UnicodeSet("[\\p{Line_break=EX}]");
\r
540 fIN = new UnicodeSet("[\\p{Line_break=IN}]");
\r
541 fNS = new UnicodeSet("[\\p{Line_break=NS}]");
\r
542 fOP = new UnicodeSet("[\\p{Line_break=OP}]");
\r
543 fQU = new UnicodeSet("[\\p{Line_break=QU}]");
\r
544 fIS = new UnicodeSet("[\\p{Line_break=IS}]");
\r
545 fNU = new UnicodeSet("[\\p{Line_break=NU}]");
\r
546 fPO = new UnicodeSet("[\\p{Line_break=PO}]");
\r
547 fPR = new UnicodeSet("[\\p{Line_break=PR}]");
\r
548 fSY = new UnicodeSet("[\\p{Line_break=SY}]");
\r
549 fAI = new UnicodeSet("[\\p{Line_break=AI}]");
\r
550 fAL = new UnicodeSet("[\\p{Line_break=AL}]");
\r
551 fID = new UnicodeSet("[\\p{Line_break=ID}]");
\r
552 fSA = new UnicodeSet("[\\p{Line_break=SA}]");
\r
553 fJL = new UnicodeSet("[\\p{Line_break=JL}]");
\r
554 fJV = new UnicodeSet("[\\p{Line_break=JV}]");
\r
555 fJT = new UnicodeSet("[\\p{Line_break=JT}]");
\r
556 fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
\r
557 fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
\r
558 fSG = new UnicodeSet("[\\ud800-\\udfff]");
\r
559 fXX = new UnicodeSet("[\\p{Line_break=XX}]");
\r
562 fAL.addAll(fXX); // Default behavior for XX is identical to AL
\r
563 fAL.addAll(fAI); // Default behavior for AI is identical to AL
\r
564 fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL
\r
565 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL
\r
608 void setText(StringBuffer s) {
\r
615 int next(int startPos) {
\r
616 int pos; // Index of the char following a potential break position
\r
617 int thisChar; // Character at above position "pos"
\r
619 int prevPos; // Index of the char preceding a potential break position
\r
620 int prevChar; // Character at above position. Note that prevChar
\r
621 // and thisChar may not be adjacent because combining
\r
622 // characters between them will be ignored.
\r
624 int nextPos; // Index of the next character following pos.
\r
625 // Usually skips over combining marks.
\r
626 int tPos; // temp value.
\r
627 int matchVals[] = null; // Number Expression Match Results
\r
630 if (startPos >= fText.length()) {
\r
635 // Initial values for loop. Loop will run the first time without finding breaks,
\r
636 // while the invalid values shift out and the "this" and
\r
637 // "prev" positions are filled in with good values.
\r
638 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
\r
639 thisChar = prevChar = 0;
\r
640 nextPos = startPos;
\r
643 // Loop runs once per position in the test text, until a break position
\r
644 // is found. In each iteration, we are testing for a possible break
\r
645 // just preceding the character at index "pos". The character preceding
\r
646 // this char is at position "prevPos"; because of combining sequences,
\r
647 // "prevPos" can be arbitrarily far before "pos".
\r
649 // Advance to the next position to be tested.
\r
651 prevChar = thisChar;
\r
653 nextPos = moveIndex32(fText, pos, 1);
\r
655 // Rule LB2 - Break at end of text.
\r
656 if (pos >= fText.length()) {
\r
660 // Rule LB 9 - adjust for combining sequences.
\r
661 // We do this rule out-of-order because the adjustment does
\r
662 // not affect the way that rules LB 3 through LB 6 match,
\r
663 // and doing it here rather than after LB 6 is substantially
\r
664 // simpler when combining sequences do occur.
\r
667 // LB 9 Keep combining sequences together.
\r
668 // advance over any CM class chars at "pos",
\r
669 // result is "nextPos" for the following loop iteration.
\r
670 thisChar = UTF16.charAt(fText, pos);
\r
671 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
\r
672 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
\r
674 if (nextPos == fText.length()) {
\r
677 int nextChar = UTF16.charAt(fText, nextPos);
\r
678 if (!fCM.contains(nextChar)) {
\r
681 nextPos = moveIndex32(fText, nextPos, 1);
\r
685 // LB 9 Treat X CM* as if it were X
\r
686 // No explicit action required.
\r
688 // LB 10 Treat any remaining combining mark as AL
\r
689 if (fCM.contains(thisChar)) {
\r
694 // If the loop is still warming up - if we haven't shifted the initial
\r
695 // -1 positions out of prevPos yet - loop back to advance the
\r
696 // position in the input without any further looking for breaks.
\r
697 if (prevPos == -1) {
\r
701 // LB 4 Always break after hard line breaks,
\r
702 if (fBK.contains(prevChar)) {
\r
706 // LB 5 Break after CR, LF, NL, but not inside CR LF
\r
707 if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
\r
710 if (fCR.contains(prevChar) ||
\r
711 fLF.contains(prevChar) ||
\r
712 fNL.contains(prevChar)) {
\r
716 // LB 6 Don't break before hard line breaks
\r
717 if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
\r
718 fLF.contains(thisChar) || fNL.contains(thisChar) ) {
\r
723 // LB 7 Don't break before spaces or zero-width space.
\r
724 if (fSP.contains(thisChar)) {
\r
728 if (fZW.contains(thisChar)) {
\r
732 // LB 8 Break after zero width space
\r
733 if (fZW.contains(prevChar)) {
\r
737 // LB 9, 10 Already done, at top of loop.
\r
744 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
\r
751 if (fGL.contains(prevChar)) {
\r
756 // [^SP BA HY] x GL
\r
757 if (!(fSP.contains(prevChar) ||
\r
758 fBA.contains(prevChar) ||
\r
759 fHY.contains(prevChar) ) && fGL.contains(thisChar)) {
\r
765 // LB 13 Don't break before closings.
\r
766 // NU x CL and NU x IS are not matched here so that they will
\r
767 // fall into LB 25 and the more general number regular expression.
\r
769 if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
\r
770 fEX.contains(thisChar) ||
\r
771 !fNU.contains(prevChar) && fIS.contains(thisChar) ||
\r
772 !fNU.contains(prevChar) && fSY.contains(thisChar)) {
\r
776 // LB 14 Don't break after OP SP*
\r
777 // Scan backwards, checking for this sequence.
\r
778 // The OP char could include combining marks, so we actually check for
\r
781 if (fSP.contains(prevChar)) {
\r
782 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
783 tPos=moveIndex32(fText, tPos, -1);
\r
786 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
787 tPos=moveIndex32(fText, tPos, -1);
\r
789 if (fOP.contains(UTF16.charAt(fText, tPos))) {
\r
793 // LB 15 Do not break within "[
\r
795 if (fOP.contains(thisChar)) {
\r
796 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
\r
798 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
799 tPos = moveIndex32(fText, tPos, -1);
\r
801 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
802 tPos = moveIndex32(fText, tPos, -1);
\r
804 if (fQU.contains(UTF16.charAt(fText, tPos))) {
\r
809 // LB 16 CL SP* x NS
\r
810 if (fNS.contains(thisChar)) {
\r
812 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
813 tPos = moveIndex32(fText, tPos, -1);
\r
815 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
816 tPos = moveIndex32(fText, tPos, -1);
\r
818 if (fCL.contains(UTF16.charAt(fText, tPos))) {
\r
824 // LB 17 B2 SP* x B2
\r
825 if (fB2.contains(thisChar)) {
\r
827 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
\r
828 tPos = moveIndex32(fText, tPos, -1);
\r
830 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
\r
831 tPos = moveIndex32(fText, tPos, -1);
\r
833 if (fB2.contains(UTF16.charAt(fText, tPos))) {
\r
838 // LB 18 break after space
\r
839 if (fSP.contains(prevChar)) {
\r
846 if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
\r
850 // LB 20 Break around a CB
\r
851 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
\r
856 if (fBA.contains(thisChar) ||
\r
857 fHY.contains(thisChar) ||
\r
858 fNS.contains(thisChar) ||
\r
859 fBB.contains(prevChar) ) {
\r
864 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
\r
865 fID.contains(prevChar) && fIN.contains(thisChar) ||
\r
866 fIN.contains(prevChar) && fIN.contains(thisChar) ||
\r
867 fNU.contains(prevChar) && fIN.contains(thisChar) ) {
\r
872 // LB 23 ID x PO (Note: Leading CM behaves like ID)
\r
875 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
\r
876 fAL.contains(prevChar) && fNU.contains(thisChar) ||
\r
877 fNU.contains(prevChar) && fAL.contains(thisChar) ) {
\r
881 // LB 24 Do not break between prefix and letters or ideographs.
\r
885 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
\r
886 fPR.contains(prevChar) && fAL.contains(thisChar) ||
\r
887 fPO.contains(prevChar) && fAL.contains(thisChar)) {
\r
893 matchVals = LBNumberCheck(fText, prevPos, matchVals);
\r
894 if (matchVals[0] != -1) {
\r
895 // Matched a number. But could have been just a single digit, which would
\r
896 // not represent a "no break here" between prevChar and thisChar
\r
897 int numEndIdx = matchVals[1]; // idx of first char following num
\r
898 if (numEndIdx > pos) {
\r
899 // Number match includes at least the two chars being checked
\r
900 if (numEndIdx > nextPos) {
\r
901 // Number match includes additional chars. Update pos and nextPos
\r
902 // so that next loop iteration will continue at the end of the number,
\r
903 // checking for breaks between last char in number & whatever follows.
\r
904 nextPos = numEndIdx;
\r
907 pos = moveIndex32(fText, pos, -1);
\r
908 thisChar = UTF16.charAt(fText, pos);
\r
910 while (fCM.contains(thisChar));
\r
917 // LB 26 Do not break Korean Syllables
\r
918 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
\r
919 fJV.contains(thisChar) ||
\r
920 fH2.contains(thisChar) ||
\r
921 fH3.contains(thisChar))) {
\r
925 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) &&
\r
926 (fJV.contains(thisChar) || fJT.contains(thisChar))) {
\r
930 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
\r
931 fJT.contains(thisChar)) {
\r
935 // LB 27 Treat a Korean Syllable Block the same as ID
\r
936 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
\r
937 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
\r
938 fIN.contains(thisChar)) {
\r
941 if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
\r
942 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
\r
943 fPO.contains(thisChar)) {
\r
946 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
\r
947 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
\r
953 // LB 28 Do not break between alphabetics
\r
954 if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
\r
958 // LB 29 Do not break between numeric punctuation and alphabetics
\r
959 if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
\r
963 // LB 30 (Withdrawn as of Unicode 5.1)
\r
965 // LB 31 Break everywhere else
\r
974 // Match the following regular expression in the input text.
\r
975 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? ((PR | PO) CM*)?
\r
976 // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
\r
977 // retVals array [0] index of the start of the match, or -1 if no match
\r
978 // [1] index of first char following the match.
\r
979 // Can not use Java regex because need supplementary character support,
\r
980 // and because Unicode char properties version must be the same as in
\r
981 // the version of ICU being tested.
\r
982 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
\r
983 if (retVals == null) {
\r
984 retVals = new int[2];
\r
986 retVals[0] = -1; // Indicates no match.
\r
987 int matchState = 0;
\r
988 int idx = startIdx;
\r
990 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
\r
991 int c = UTF16.charAt(s, idx);
\r
992 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
\r
993 switch (matchState) {
\r
995 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
\r
996 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
\r
1000 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
\r
1004 if (cLBType == UCharacter.LineBreak.HYPHEN) {
\r
1008 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1012 break matchLoop; /* No Match */
\r
1015 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1019 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
\r
1023 if (cLBType == UCharacter.LineBreak.HYPHEN) {
\r
1027 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1031 break matchLoop; /* No Match */
\r
1035 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1039 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1043 break matchLoop; /* No Match */
\r
1044 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? ((PR | PO) CM*)?
\r
1045 // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
\r
1048 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1052 if (cLBType == UCharacter.LineBreak.NUMERIC) {
\r
1056 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
\r
1060 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
\r
1064 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
\r
1068 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
\r
1072 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
\r
1077 break matchLoop; // Match Complete.
\r
1079 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1083 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
\r
1087 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
\r
1091 break matchLoop; // Match Complete.
\r
1093 if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
\r
1097 break matchLoop; // Match Complete.
\r
1100 if (matchState > 4) {
\r
1101 retVals[0] = startIdx;
\r
1102 retVals[1] = idx;
\r
1108 List charClasses() {
\r
1119 * Sentence Monkey Test Class
\r
1124 static class RBBISentenceMonkey extends RBBIMonkeyKind {
\r
1126 StringBuffer fText;
\r
1128 UnicodeSet fSepSet;
\r
1129 UnicodeSet fFormatSet;
\r
1130 UnicodeSet fSpSet;
\r
1131 UnicodeSet fLowerSet;
\r
1132 UnicodeSet fUpperSet;
\r
1133 UnicodeSet fOLetterSet;
\r
1134 UnicodeSet fNumericSet;
\r
1135 UnicodeSet fATermSet;
\r
1136 UnicodeSet fSContinueSet;
\r
1137 UnicodeSet fSTermSet;
\r
1138 UnicodeSet fCloseSet;
\r
1139 UnicodeSet fOtherSet;
\r
1140 UnicodeSet fExtendSet;
\r
1144 RBBISentenceMonkey() {
\r
1145 fCharProperty = UProperty.SENTENCE_BREAK;
\r
1147 fSets = new ArrayList();
\r
1149 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
\r
1150 // set and made into character classes of their own. For the monkey impl,
\r
1151 // they remain in SEP, since Sep always appears with CR and LF in the rules.
\r
1152 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
\r
1153 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]");
\r
1154 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
\r
1155 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
\r
1156 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
\r
1157 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
\r
1158 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
\r
1159 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
\r
1160 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
\r
1161 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
\r
1162 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
\r
1163 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
\r
1164 fOtherSet = new UnicodeSet();
\r
1167 fOtherSet.complement();
\r
1168 fOtherSet.removeAll(fSepSet);
\r
1169 fOtherSet.removeAll(fFormatSet);
\r
1170 fOtherSet.removeAll(fSpSet);
\r
1171 fOtherSet.removeAll(fLowerSet);
\r
1172 fOtherSet.removeAll(fUpperSet);
\r
1173 fOtherSet.removeAll(fOLetterSet);
\r
1174 fOtherSet.removeAll(fNumericSet);
\r
1175 fOtherSet.removeAll(fATermSet);
\r
1176 fOtherSet.removeAll(fSContinueSet);
\r
1177 fOtherSet.removeAll(fSTermSet);
\r
1178 fOtherSet.removeAll(fCloseSet);
\r
1179 fOtherSet.removeAll(fExtendSet);
\r
1181 fSets.add(fSepSet);
\r
1182 fSets.add(fFormatSet);
\r
1184 fSets.add(fSpSet);
\r
1185 fSets.add(fLowerSet);
\r
1186 fSets.add(fUpperSet);
\r
1187 fSets.add(fOLetterSet);
\r
1188 fSets.add(fNumericSet);
\r
1189 fSets.add(fATermSet);
\r
1190 fSets.add(fSContinueSet);
\r
1191 fSets.add(fSTermSet);
\r
1192 fSets.add(fCloseSet);
\r
1193 fSets.add(fOtherSet);
\r
1194 fSets.add(fExtendSet);
\r
1198 List charClasses() {
\r
1202 void setText(StringBuffer s) {
\r
1207 // moveBack() Find the "significant" code point preceding the index i.
\r
1208 // Skips over ($Extend | $Format)*
\r
1210 private int moveBack(int i) {
\r
1219 j = moveIndex32(fText, j, -1);
\r
1220 c = UTF16.charAt(fText, j);
\r
1222 while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
\r
1227 int moveForward(int i) {
\r
1228 if (i>=fText.length()) {
\r
1229 return fText.length();
\r
1234 j = moveIndex32(fText, j, 1);
\r
1237 while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
\r
1242 int cAt(int pos) {
\r
1243 if (pos<0 || pos>=fText.length()) {
\r
1246 return UTF16.charAt(fText, pos);
\r
1249 int next(int prevPos) {
\r
1250 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the
\r
1251 // break position being tested. The candidate break
\r
1252 // location is before p2.
\r
1253 int breakPos = -1;
\r
1255 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
\r
1258 // Prev break at end of string. return DONE.
\r
1259 if (prevPos >= fText.length()) {
\r
1262 /*p0 =*/ p1 = p2 = p3 = prevPos;
\r
1263 c3 = UTF16.charAt(fText, prevPos);
\r
1266 // Loop runs once per "significant" character position in the input text.
\r
1268 // Move all of the positions forward in the input string.
\r
1269 /*p0 = p1;*/ c0 = c1;
\r
1273 // Advance p3 by X(Extend | Format)* Rule 4
\r
1274 p3 = moveForward(p3);
\r
1277 // Rule (3) CR x LF
\r
1278 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
\r
1282 // Rule (4) Sep <break>
\r
1283 if (fSepSet.contains(c1)) {
\r
1284 p2 = p1+1; // Separators don't combine with Extend or Format
\r
1288 if (p2 >= fText.length()) {
\r
1289 // Reached end of string. Always a break position.
\r
1293 if (p2 == prevPos) {
\r
1294 // Still warming up the loop. (won't work with zero length strings, but we don't care)
\r
1298 // Rule (6). ATerm x Numeric
\r
1299 if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
\r
1303 // Rule (7). Upper ATerm x Upper
\r
1304 if (fUpperSet.contains(c0) && fATermSet.contains(c1) && fUpperSet.contains(c2)) {
\r
1308 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep))* Lower
\r
1309 // Note: STerm | ATerm are added to the negated part of the expression by a
\r
1310 // note to the Unicode 5.0 documents.
\r
1312 while (p8>0 && fSpSet.contains(cAt(p8))) {
\r
1313 p8 = moveBack(p8);
\r
1315 while (p8>0 && fCloseSet.contains(cAt(p8))) {
\r
1316 p8 = moveBack(p8);
\r
1318 if (fATermSet.contains(cAt(p8))) {
\r
1322 if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
\r
1323 fLowerSet.contains(c) || fSepSet.contains(c) ||
\r
1324 fATermSet.contains(c) || fSTermSet.contains(c))
\r
1328 p8 = moveForward(p8);
\r
1330 if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
\r
1335 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
\r
1336 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
\r
1338 while (setContains(fSpSet, cAt(p8))) {
\r
1339 p8 = moveBack(p8);
\r
1341 while (setContains(fCloseSet, cAt(p8))) {
\r
1342 p8 = moveBack(p8);
\r
1345 if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
\r
1351 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
\r
1353 while (p9>0 && fCloseSet.contains(cAt(p9))) {
\r
1354 p9 = moveBack(p9);
\r
1357 if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
\r
1358 if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
\r
1363 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
\r
1365 while (p10>0 && fSpSet.contains(cAt(p10))) {
\r
1366 p10 = moveBack(p10);
\r
1368 while (p10>0 && fCloseSet.contains(cAt(p10))) {
\r
1369 p10 = moveBack(p10);
\r
1371 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
\r
1372 if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
\r
1377 // Rule (11) (STerm | ATerm) Close* Sp* <break>
\r
1379 if (p11>0 && fSepSet.contains(cAt(p11))) {
\r
1380 p11 = moveBack(p11);
\r
1382 while (p11>0 && fSpSet.contains(cAt(p11))) {
\r
1383 p11 = moveBack(p11);
\r
1385 while (p11>0 && fCloseSet.contains(cAt(p11))) {
\r
1386 p11 = moveBack(p11);
\r
1388 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
\r
1392 // Rule (12) Any x Any
\r
1405 * Move an index into a string by n code points.
\r
1406 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
\r
1407 * complicating usage.
\r
1408 * @param s a Text string
\r
1409 * @param pos The starting code unit index into the text string
\r
1410 * @param amt The amount to adjust the string by.
\r
1411 * @return The adjusted code unit index, pinned to the string's length, or
\r
1412 * unchanged if input index was outside of the string.
\r
1414 static int moveIndex32(StringBuffer s, int pos, int amt) {
\r
1418 for (i=0; i<amt; i++) {
\r
1419 if (pos >= s.length()) {
\r
1420 return s.length();
\r
1422 c = s.charAt(pos);
\r
1424 if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
\r
1425 c = s.charAt(pos);
\r
1426 if (UTF16.isTrailSurrogate(c)) {
\r
1432 for (i=0; i>amt; i--) {
\r
1437 c = s.charAt(pos);
\r
1438 if (UTF16.isTrailSurrogate(c) && pos >= 0) {
\r
1439 c = s.charAt(pos);
\r
1440 if (UTF16.isLeadSurrogate(c)) {
\r
1450 * No-exceptions form of UnicodeSet.contains(c).
\r
1451 * Simplifies loops that terminate with an end-of-input character value.
\r
1452 * @param s A unicode set
\r
1453 * @param c A code point value
\r
1454 * @return true if the set contains c.
\r
1456 static boolean setContains(UnicodeSet s, int c) {
\r
1457 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
\r
1460 return s.contains(c);
\r
1465 * return the index of the next code point in the input text.
\r
1466 * @param i the preceding index
\r
1470 static int nextCP(StringBuffer s, int i) {
\r
1472 // End of Input indication. Continue to return end value.
\r
1475 int retVal = i + 1;
\r
1476 if (retVal > s.length()) {
\r
1479 int c = UTF16.charAt(s, i);
\r
1480 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
\r
1488 * random number generator. Not using Java's built-in Randoms for two reasons:
\r
1489 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
\r
1490 * 2. We need to get and restore the seed from values occurring in the middle
\r
1491 * of a long sequence, to more easily reproduce failing cases.
\r
1493 private static int m_seed = 1;
\r
1494 private static int m_rand()
\r
1496 m_seed = m_seed * 1103515245 + 12345;
\r
1497 return (int)(m_seed >>> 16) % 32768;
\r
1500 // Helper function for formatting error output.
\r
1501 // Append a string into a fixed-size field in a StringBuffer.
\r
1502 // Blank-pad the string if it is shorter than the field.
\r
1503 // Truncate the source string if it is too long.
\r
1505 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
\r
1506 int appendLen = src.length();
\r
1507 if (appendLen >= fieldLen) {
\r
1508 dest.append(src.substring(0, fieldLen));
\r
1511 while (appendLen < fieldLen) {
\r
1518 // Helper function for formatting error output.
\r
1519 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
\r
1520 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
\r
1521 String hexChars = "0123456789abcdef";
\r
1522 if (c < 0x10000) {
\r
1523 dest.append("\\u");
\r
1524 for (int bn=12; bn>=0; bn-=4) {
\r
1525 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
\r
1527 appendToBuf(dest, " ", fieldLen-6);
\r
1529 dest.append("\\U");
\r
1530 for (int bn=28; bn>=0; bn-=4) {
\r
1531 dest.append(hexChars.charAt((((int)c)>>bn)&0xf));
\r
1533 appendToBuf(dest, " ", fieldLen-10);
\r
1539 * Run a RBBI monkey test. Common routine, for all break iterator types.
\r
1541 * bi - the break iterator to use
\r
1542 * mk - MonkeyKind, abstraction for obtaining expected results
\r
1543 * name - Name of test (char, word, etc.) for use in error messages
\r
1544 * seed - Seed for starting random number generator (parameter from user)
\r
// Drives one monkey test. Each iteration builds a random test string from mk's character
// classes, computes the expected boundaries with mk.next(), collects the boundaries that bi
// reports through next(), previous(), isBoundary(), following() and preceding(), and calls
// errln() for every index where one of them disagrees.
// numIterations == -1 keeps the main loop running indefinitely (see the while condition).
// NOTE(review): this listing has lost source lines (the embedded line numbers skip), so some
// declarations (i, chClasses, breakPos, count, ci, ...) and loop headers are not visible here.
1547 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) {
\r
1548 int TESTSTRINGLEN = 500;
\r
1549 StringBuffer testText = new StringBuffer();
\r
1550 int numCharClasses;
\r
// The arrays below are indexed by code unit position 0 .. testText.length(). Presumably
// TESTSTRINGLEN*2 + 1 allows two UTF-16 units per generated code point plus the end index.
1552 int[] expected = new int[TESTSTRINGLEN*2 + 1];
\r
1553 int expectedCount = 0;
\r
1554 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1555 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1556 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1557 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1558 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1559 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
\r
1561 int loopCount = 0;
\r
1562 boolean printTestData = false;
\r
1563 boolean printBreaksFromBI = false;
\r
1567 numCharClasses = mk.charClasses().size();
\r
1568 chClasses = mk.charClasses();
\r
1570 // Verify that the character classes all have at least one member.
\r
1571 for (i=0; i<numCharClasses; i++) {
\r
1572 UnicodeSet s = (UnicodeSet)chClasses.get(i);
\r
1573 if (s == null || s.size() == 0) {
\r
1574 errln("Character Class " + i + " is null or of zero size.");
\r
1579 //--------------------------------------------------------------------------------------------
\r
1581 // Debugging settings. Comment out everything in the following block for normal operation
\r
1583 //--------------------------------------------------------------------------------------------
\r
1584 // numIterations = -1;
\r
1585 // RuleBasedBreakIterator_New.fTrace = true;
\r
1586 // m_seed = 859056465;
\r
1587 // TESTSTRINGLEN = 50;
\r
1588 // printTestData = true;
\r
1589 // printBreaksFromBI = true;
\r
1590 // ((RuleBasedBreakIterator_New)bi).dump();
\r
1592 //--------------------------------------------------------------------------------------------
\r
1594 // End of Debugging settings.
\r
1596 //--------------------------------------------------------------------------------------------
\r
1598 int dotsOnLine = 0;
\r
1599 while (loopCount < numIterations || numIterations == -1) {
\r
1600 if (numIterations == -1 && loopCount % 10 == 0) {
\r
1601 // If test is running in an infinite loop, display a periodic tic so
\r
1602 // we can tell that it is making progress.
\r
1603 System.out.print(".");
\r
1604 if (dotsOnLine++ >= 80){
\r
1605 System.out.println();
\r
1609 // Save current random number seed, so that we can recreate the random numbers
\r
1610 // for this loop iteration in event of an error.
\r
1613 testText.setLength(0);
\r
1614 // Populate a test string with data.
\r
1615 if (printTestData) {
\r
1616 System.out.println("Test Data string ...");
\r
// Each character: pick a random class, then a random member of that class.
1618 for (i=0; i<TESTSTRINGLEN; i++) {
\r
1619 int aClassNum = m_rand() % numCharClasses;
\r
1620 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum);
\r
1621 int charIdx = m_rand() % classSet.size();
\r
1622 int c = classSet.charAt(charIdx);
\r
1623 if (c < 0) { // TODO: deal with sets containing strings.
\r
1626 UTF16.appendCodePoint(testText, c);
\r
1627 if (printTestData) {
\r
1628 System.out.print(Integer.toHexString(c) + " ");
\r
1631 if (printTestData) {
\r
1632 System.out.println();
\r
// Clear the results left over from the previous iteration.
1635 Arrays.fill(expected, 0);
\r
1636 Arrays.fill(expectedBreaks, false);
\r
1637 Arrays.fill(forwardBreaks, false);
\r
1638 Arrays.fill(reverseBreaks, false);
\r
1639 Arrays.fill(isBoundaryBreaks, false);
\r
1640 Arrays.fill(followingBreaks, false);
\r
1641 Arrays.fill(precedingBreaks, false);
\r
1643 // Calculate the expected results for this test string.
\r
1644 mk.setText(testText);
\r
1645 expectedCount = 0;
\r
// Position 0 is always recorded as an expected break.
1646 expectedBreaks[0] = true;
\r
1647 expected[expectedCount ++] = 0;
\r
1649 int lastBreakPos = -1;
\r
1651 lastBreakPos = breakPos;
\r
1652 breakPos = mk.next(breakPos);
\r
1653 if (breakPos == -1) {
\r
// Sanity checks on the reference implementation itself.
1656 if (breakPos > testText.length()) {
\r
1657 errln("breakPos > testText.length()");
\r
1659 if (lastBreakPos >= breakPos) {
\r
1660 errln("Next() not increasing.");
\r
1663 expectedBreaks[breakPos] = true;
\r
1664 expected[expectedCount ++] = breakPos;
\r
1667 // Find the break positions using forward iteration
\r
1668 if (printBreaksFromBI) {
\r
1669 System.out.println("Breaks from BI...");
\r
1671 bi.setText(testText.toString());
\r
1672 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
\r
1673 if (i < 0 || i > testText.length()) {
\r
1674 errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
\r
1677 if (printBreaksFromBI) {
\r
1678 System.out.print(Integer.toHexString(i) + " ");
\r
1680 forwardBreaks[i] = true;
\r
1682 if (printBreaksFromBI) {
\r
1683 System.out.println();
\r
1686 // Find the break positions using reverse iteration
\r
1687 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
\r
1688 if (i < 0 || i > testText.length()) {
// NOTE(review): this loop iterates with previous(), but the message names next() and appends
// `name` a second time at the end -- looks copied from the forward loop; confirm and fix.
1689 errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
\r
1692 reverseBreaks[i] = true;
\r
1695 // Find the break positions using isBoundary() tests.
\r
1696 for (i=0; i<=testText.length(); i++) {
\r
1697 isBoundaryBreaks[i] = bi.isBoundary(i);
\r
1700 // Find the break positions using the following() function.
\r
1702 followingBreaks[0] = true;
\r
1703 for (i=0; i<testText.length(); i++) {
\r
1704 breakPos = bi.following(i);
\r
// Reject results that are not after i, that move backwards, that run past the end of the
// text, or that skip over a break already known to lie after i.
1705 if (breakPos <= i ||
\r
1706 breakPos < lastBreakPos ||
\r
1707 breakPos > testText.length() ||
\r
1708 breakPos > lastBreakPos && lastBreakPos > i ) {
\r
1709 errln(name + " break monkey test: " +
\r
1710 "Out of range value returned by BreakIterator::following().\n" +
\r
1711 "index=" + i + "following returned=" + breakPos +
\r
1712 "lastBreak=" + lastBreakPos);
\r
// NOTE(review): this is the following() check, yet it flips precedingBreaks[i]. It looks like
// a copy/paste from the preceding() loop below (followingBreaks[i] was probably intended), and
// the preceding() pass may later overwrite this entry -- confirm.
1713 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
\r
1715 followingBreaks[breakPos] = true;
\r
1716 lastBreakPos = breakPos;
\r
1720 // Find the break positions using the preceding() function.
\r
1721 lastBreakPos = testText.length();
\r
1722 precedingBreaks[testText.length()] = true;
\r
1723 for (i=testText.length(); i>0; i--) {
\r
1724 breakPos = bi.preceding(i);
\r
// Mirror image of the following() validity checks above.
1725 if (breakPos >= i ||
\r
1726 breakPos > lastBreakPos ||
\r
1728 breakPos < lastBreakPos && lastBreakPos < i ) {
\r
1729 errln(name + " break monkey test: " +
\r
1730 "Out of range value returned by BreakIterator::preceding().\n" +
\r
1731 "index=" + i + "preceding returned=" + breakPos +
\r
1732 "lastBreak=" + lastBreakPos);
\r
1733 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
\r
1735 precedingBreaks[breakPos] = true;
\r
1736 lastBreakPos = breakPos;
\r
1742 // Compare the expected and actual results.
\r
1743 for (i=0; i<=testText.length(); i++) {
\r
1744 String errorType = null;
\r
// Only the first disagreeing API is reported for each index. previous() is compared with
// the forward result, which at that point is already known to equal the expected value.
1745 if (forwardBreaks[i] != expectedBreaks[i]) {
\r
1746 errorType = "next()";
\r
1747 } else if (reverseBreaks[i] != forwardBreaks[i]) {
\r
1748 errorType = "previous()";
\r
1749 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
\r
1750 errorType = "isBoundary()";
\r
1751 } else if (followingBreaks[i] != expectedBreaks[i]) {
\r
1752 errorType = "following()";
\r
1753 } else if (precedingBreaks[i] != expectedBreaks[i]) {
\r
1754 errorType = "preceding()";
\r
1758 if (errorType != null) {
\r
1759 // Format a range of the test text that includes the failure as
\r
1760 // a data item that can be included in the rbbi test data file.
\r
1762 // Start of the range is the last point where expected and actual results
\r
1763 // both agreed that there was a break position.
\r
1764 int startContext = i;
\r
1767 if (startContext==0) { break; }
\r
1769 if (expectedBreaks[startContext]) {
\r
1770 if (count == 2) break;
\r
1775 // End of range is two expected breaks past the start position.
\r
1776 int endContext = i + 1;
\r
1778 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
\r
1780 if (endContext >= testText.length()) {break;}
\r
1781 if (expectedBreaks[endContext-1]) {
\r
1782 if (count == 0) break;
\r
1789 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
\r
1790 StringBuffer errorText = new StringBuffer();
\r
1792 int c; // Char from test data
\r
// Walk the context range one code point at a time; nextCP() yields -1 past the end.
1793 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
\r
1795 // This is the location of the error.
\r
1796 errorText.append("<?>---------------------------------\n");
\r
1797 } else if (expectedBreaks[ci]) {
\r
1798 // This a non-error expected break position.
\r
1799 errorText.append("------------------------------------\n");
\r
// One row per code point: escaped value, general category, the monkey's own property, name.
1801 if (ci < testText.length()) {
\r
1802 c = UTF16.charAt(testText, ci);
\r
1803 appendCharToBuf(errorText, c, 11);
\r
1804 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
\r
1805 appendToBuf(errorText, gc, 8);
\r
1806 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
\r
1807 String extraPropValue =
\r
1808 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
\r
1809 appendToBuf(errorText, extraPropValue, 20);
\r
1811 String charName = UCharacter.getExtendedName(c);
\r
1812 appendToBuf(errorText, charName, 40);
\r
1813 errorText.append('\n');
\r
1816 if (ci == testText.length() && ci != -1) {
\r
1817 errorText.append("<>");
\r
1819 errorText.append("</data>\n");
\r
1821 // Output the error
\r
1822 errln(name + " break monkey test error. " +
\r
1823 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
\r
1824 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" +
\r
// Monkey test for character breaks: checks BreakIterator.getCharacterInstance(Locale.US)
// against the independent RBBICharMonkey rules.
// Runs 500 iterations, or 10000 when params.inclusion >= 9.
// NOTE(review): the declaration of `seed` is on a line missing from this listing.
1834 public void TestCharMonkey() {
\r
1836 int loopCount = 500;
\r
1839 if (params.inclusion >= 9) {
\r
1840 loopCount = 10000;
\r
1843 RBBICharMonkey m = new RBBICharMonkey();
\r
1844 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
\r
1845 RunMonkey(bi, m, "char", seed, loopCount);
\r
// Monkey test for word breaks: checks BreakIterator.getWordInstance(Locale.US)
// against the independent RBBIWordMonkey rules.
// Runs 500 iterations, or 10000 when params.inclusion >= 9.
// NOTE(review): the declaration of `seed` is on a line missing from this listing.
1848 public void TestWordMonkey() {
\r
1850 int loopCount = 500;
\r
1853 if (params.inclusion >= 9) {
\r
1854 loopCount = 10000;
\r
1857 logln("Word Break Monkey Test");
\r
1858 RBBIWordMonkey m = new RBBIWordMonkey();
\r
1859 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
\r
1860 RunMonkey(bi, m, "word", seed, loopCount);
\r
// Monkey test for line breaks: checks BreakIterator.getLineInstance(Locale.US)
// against the independent RBBILineMonkey rules.
// Runs 500 iterations, or 10000 when params.inclusion >= 9.
// NOTE(review): the declaration of `seed` is on a line missing from this listing.
1863 public void TestLineMonkey() {
\r
1865 int loopCount = 500;
\r
1868 if (params.inclusion >= 9) {
\r
1869 loopCount = 10000;
\r
1872 logln("Line Break Monkey Test");
\r
1873 RBBILineMonkey m = new RBBILineMonkey();
\r
1874 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
\r
// NOTE(review): params was already dereferenced above (params.inclusion), so a null params
// would have thrown before reaching this check; the body of this branch is not visible here.
1875 if (params == null) {
\r
1878 RunMonkey(bi, m, "line", seed, loopCount);
\r
// Monkey test for sentence breaks: checks BreakIterator.getSentenceInstance(Locale.US)
// against the independent RBBISentenceMonkey rules.
// Runs 500 iterations by default; the value used when params.inclusion >= 9 and the
// declaration of `seed` are on lines missing from this listing.
1881 public void TestSentMonkey() {
\r
1883 int loopCount = 500;
\r
1886 if (params.inclusion >= 9) {
\r
1890 logln("Sentence Break Monkey Test");
\r
1891 RBBISentenceMonkey m = new RBBISentenceMonkey();
\r
1892 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
\r
// NOTE(review): params was already dereferenced above (params.inclusion), so a null params
// would have thrown before reaching this check; the body of this branch is not visible here.
1893 if (params == null) {
\r
1896 RunMonkey(bi, m, "sent", seed, loopCount);
\r
1899 // Round-trip monkey tests.
\r
1900 // Verify that break iterators created from the rule source from the default
\r
1901 // break iterators still pass the monkey test for the iterator type.
\r
1903 // This is a major test for the Rule Compiler. The default break iterators are built
\r
1904 // from pre-compiled binary rule data that was created using ICU4C; these
\r
1905 // round-trip rule recompile tests verify that the Java rule compiler can
\r
1906 // rebuild break iterators from the original source rules.
\r
// Round-trip monkey test for character breaks: takes the rule source of the default character
// iterator via toString(), rebuilds a RuleBasedBreakIterator from it, and runs the
// RBBICharMonkey comparison against the rebuilt iterator.
// Runs 200 iterations by default; the value used when params.inclusion >= 9 and the
// declaration of `seed` are on lines missing from this listing.
1908 public void TestRTCharMonkey() {
\r
1910 int loopCount = 200;
\r
1913 if (params.inclusion >= 9) {
\r
1917 RBBICharMonkey m = new RBBICharMonkey();
\r
1918 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US);
\r
1919 String rules = bi.toString();
\r
1920 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
1921 RunMonkey(rtbi, m, "char", seed, loopCount);
\r
// Round-trip monkey test for word breaks: takes the rule source of the default word iterator
// via toString(), rebuilds a RuleBasedBreakIterator from it, and runs the RBBIWordMonkey
// comparison against the rebuilt iterator.
// Runs 200 iterations by default; the value used when params.inclusion >= 9 and the
// declaration of `seed` are on lines missing from this listing.
1924 public void TestRTWordMonkey() {
\r
1926 int loopCount = 200;
\r
1929 if (params.inclusion >= 9) {
\r
1933 logln("Word Break Monkey Test");
\r
1934 RBBIWordMonkey m = new RBBIWordMonkey();
\r
1935 BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
\r
1936 String rules = bi.toString();
\r
1937 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
1938 RunMonkey(rtbi, m, "word", seed, loopCount);
\r
// Round-trip monkey test for line breaks: takes the rule source of the default line iterator
// via toString(), rebuilds a RuleBasedBreakIterator from it, and runs the RBBILineMonkey
// comparison against the rebuilt iterator.
// Runs 200 iterations by default; the value used when params.inclusion >= 9 and the
// declaration of `seed` are on lines missing from this listing.
1941 public void TestRTLineMonkey() {
\r
1943 int loopCount = 200;
\r
1946 if (params.inclusion >= 9) {
\r
1950 logln("Line Break Monkey Test");
\r
1951 RBBILineMonkey m = new RBBILineMonkey();
\r
1952 BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
\r
1953 String rules = bi.toString();
\r
1954 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
// NOTE(review): params was already dereferenced above (params.inclusion), so a null params
// would have thrown before reaching this check; the body of this branch is not visible here.
1955 if (params == null) {
\r
1958 RunMonkey(rtbi, m, "line", seed, loopCount);
\r
// Round-trip monkey test for sentence breaks: takes the rule source of the default sentence
// iterator via toString(), rebuilds a RuleBasedBreakIterator from it, and runs the
// RBBISentenceMonkey comparison against the rebuilt iterator.
// Runs 200 iterations by default; the value used when params.inclusion >= 9 and the
// declaration of `seed` are on lines missing from this listing.
1961 public void TestRTSentMonkey() {
\r
1963 int loopCount = 200;
\r
1966 if (params.inclusion >= 9) {
\r
1970 logln("Sentence Break Monkey Test");
\r
1971 RBBISentenceMonkey m = new RBBISentenceMonkey();
\r
1972 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
\r
1973 String rules = bi.toString();
\r
1974 BreakIterator rtbi = new RuleBasedBreakIterator(rules);
\r
// NOTE(review): params was already dereferenced above (params.inclusion), so a null params
// would have thrown before reaching this check; the body of this branch is not visible here.
1975 if (params == null) {
\r
1978 RunMonkey(rtbi, m, "sent", seed, loopCount);
\r