/*
*******************************************************************************
- * Copyright (C) 2003-2010 International Business Machines Corporation and *
- * others. All Rights Reserved. *
+ * Copyright (C) 2003-2013 International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.rbbi;
// Set the test text on which subsequent calls to next() will operate
abstract void setText(StringBuffer text);
- // Find the next break postion, starting from the specified position.
+ // Find the next break position, starting from the specified position.
// Return -1 after reaching end of string.
abstract int next(int i);
// A Character Property, one of the constants defined in class UProperty.
- // The value fo this property will be displayed for the characters
+ // The value of this property will be displayed for the characters
// near any test failure.
int fCharProperty;
}
/**
* Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
+ * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
*/
static class RBBICharMonkey extends RBBIMonkeyKind {
List fSets;
UnicodeSet fCRLFSet;
UnicodeSet fControlSet;
UnicodeSet fExtendSet;
+ UnicodeSet fRegionalIndicatorSet;
UnicodeSet fPrependSet;
UnicodeSet fSpacingSet;
UnicodeSet fLSet;
fCRLFSet = new UnicodeSet("[\\r\\n]");
fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
+ fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
fSets.add(fCRLFSet);
fSets.add(fControlSet);
fSets.add(fExtendSet);
- fSets.add(fPrependSet);
+ fSets.add(fRegionalIndicatorSet);
+ if (!fPrependSet.isEmpty()) {
+ fSets.add(fPrependSet);
+ }
fSets.add(fSpacingSet);
fSets.add(fHangulSet);
fSets.add(fAnySet);
continue;
}
+ // Rule (GB8a) Regional_Indicator x Regional_Indicator
+ if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
+ continue;
+ }
+
// Rule (GB9) Numeric x ALetter
if (fExtendSet.contains(c2)) {
continue;
}
-
+
// Rule (GB9a) x SpacingMark
if (fSpacingSet.contains(c2)) {
continue;
UnicodeSet fCRSet;
UnicodeSet fLFSet;
UnicodeSet fNewlineSet;
+ UnicodeSet fRegionalIndicatorSet;
UnicodeSet fKatakanaSet;
+ UnicodeSet fHebrew_LetterSet;
UnicodeSet fALetterSet;
+ UnicodeSet fSingle_QuoteSet;
+ UnicodeSet fDouble_QuoteSet;
UnicodeSet fMidNumLetSet;
UnicodeSet fMidLetterSet;
UnicodeSet fMidNumSet;
UnicodeSet fFormatSet;
UnicodeSet fExtendSet;
UnicodeSet fExtendNumLetSet;
- UnicodeSet fOtherSet;
+ UnicodeSet fOtherSet;
+ UnicodeSet fDictionaryCjkSet;
RBBIWordMonkey() {
fCharProperty = UProperty.WORD_BREAK;
+ fDictionaryCjkSet= new UnicodeSet("[[:Script=Hangul:][:Han:][:Hiragana:][:Katakana:]]");
fCRSet = new UnicodeSet("[\\p{Word_Break = CR}]");
fLFSet = new UnicodeSet("[\\p{Word_Break = LF}]");
fNewlineSet = new UnicodeSet("[\\p{Word_Break = Newline}]");
- fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
+ fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
+ fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
+ fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
+ fALetterSet.removeAll(fDictionaryCjkSet);
+ fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
+ fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
fOtherSet.removeAll(fLFSet);
fOtherSet.removeAll(fNewlineSet);
fOtherSet.removeAll(fALetterSet);
+ fOtherSet.removeAll(fSingle_QuoteSet);
+ fOtherSet.removeAll(fDouble_QuoteSet);
fOtherSet.removeAll(fKatakanaSet);
+ fOtherSet.removeAll(fHebrew_LetterSet);
fOtherSet.removeAll(fMidLetterSet);
fOtherSet.removeAll(fMidNumSet);
fOtherSet.removeAll(fNumericSet);
fOtherSet.removeAll(fFormatSet);
fOtherSet.removeAll(fExtendSet);
fOtherSet.removeAll(fExtendNumLetSet);
+ fOtherSet.removeAll(fRegionalIndicatorSet);
// Inhibit dictionary characters from being tested at all.
- fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
+ // remove surrogates so as to not generate higher CJK characters
+ fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
+ fOtherSet.removeAll(fDictionaryCjkSet);
fSets = new ArrayList();
fSets.add(fCRSet);
fSets.add(fLFSet);
fSets.add(fNewlineSet);
+ fSets.add(fRegionalIndicatorSet);
+ fSets.add(fHebrew_LetterSet);
fSets.add(fALetterSet);
- fSets.add(fKatakanaSet);
+ //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
+ fSets.add(fSingle_QuoteSet);
+ fSets.add(fDouble_QuoteSet);
fSets.add(fMidLetterSet);
fSets.add(fMidNumLetSet);
fSets.add(fMidNumSet);
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
- // Advancd p3 by X(Extend | Format)* Rule 4
+ // Advance p3 by X(Extend | Format)* Rule 4
// But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
do {
p3 = moveIndex32(fText, p3, 1);
break;
}
- // Rule (5). ALetter x ALetter
- if (fALetterSet.contains(c1) &&
- fALetterSet.contains(c2)) {
+ // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
+ if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
+ (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
continue;
}
-
- // Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
+
+ // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
//
- if ( fALetterSet.contains(c1) &&
- (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
- setContains(fALetterSet, c3)) {
+ if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
+ (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
+ (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
continue;
}
-
-
- // Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
- if (fALetterSet.contains(c0) &&
- (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
- fALetterSet.contains(c2)) {
+
+ // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
+ if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
+ (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
+ (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
+ continue;
+ }
+
+ // Rule (7a) Hebrew_Letter x Single_Quote
+ if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
+ continue;
+ }
+
+ // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
+ if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
+ continue;
+ }
+
+ // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
+ if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
continue;
}
continue;
}
- // Rule (9) ALetter x Numeric
- if (fALetterSet.contains(c1) &&
- fNumericSet.contains(c2)) {
+ // Rule (9) (ALetter | Hebrew_Letter) x Numeric
+ if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
+ fNumericSet.contains(c2)) {
continue;
}
- // Rule (10) Numeric x ALetter
+ // Rule (10) Numeric x (ALetter | Hebrew_Letter)
if (fNumericSet.contains(c1) &&
- fALetterSet.contains(c2)) {
+ (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
continue;
}
-
- // Rule (11) Numeric (MidNum | MidNumLet) x Numeric
- if ( fNumericSet.contains(c0) &&
- (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
+
+ // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
+ if (fNumericSet.contains(c0) &&
+ (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
fNumericSet.contains(c2)) {
continue;
}
- // Rule (12) Numeric x (MidNum | MidNumLet) Numeric
+ // Rule (12) Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
if (fNumericSet.contains(c1) &&
- (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
- setContains(fNumericSet, c3)) {
+ (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
+ setContains(fNumericSet, c3)) {
continue;
}
fKatakanaSet.contains(c2)) {
continue;
}
-
- // Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
- if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
+
+ // Rule 13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
+ if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
fExtendNumLetSet.contains(c2)) {
continue;
}
- // Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
+
+ // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
if (fExtendNumLetSet.contains(c1) &&
- (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
- fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
+ (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
+ fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) {
continue;
}
-
+
+
+ // Rule 13c Do not break between Regional Indicators.
+ // Regional_Indicator × Regional_Indicator
+ if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
+ continue;
+ }
+
// Rule 14. Break found here.
break;
}
UnicodeSet fSY;
UnicodeSet fAI;
UnicodeSet fAL;
+ UnicodeSet fHL;
UnicodeSet fID;
UnicodeSet fSA;
UnicodeSet fJL;
UnicodeSet fJT;
UnicodeSet fH2;
UnicodeSet fH3;
+ UnicodeSet fRI;
UnicodeSet fXX;
StringBuffer fText;
fSY = new UnicodeSet("[\\p{Line_break=SY}]");
fAI = new UnicodeSet("[\\p{Line_break=AI}]");
fAL = new UnicodeSet("[\\p{Line_break=AL}]");
+ fHL = new UnicodeSet("[\\p{Line_break=HL}]");
fID = new UnicodeSet("[\\p{Line_break=ID}]");
fSA = new UnicodeSet("[\\p{Line_break=SA}]");
fJL = new UnicodeSet("[\\p{Line_break=JL}]");
fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
fSG = new UnicodeSet("[\\ud800-\\udfff]");
+ fRI = new UnicodeSet("[\\p{Line_break=RI}]");
fXX = new UnicodeSet("[\\p{Line_break=XX}]");
fSets.add(fSY);
fSets.add(fAI);
fSets.add(fAL);
+ fSets.add(fHL);
fSets.add(fID);
fSets.add(fWJ);
fSets.add(fSA);
fSets.add(fSG);
-
+ fSets.add(fRI);
}
void setText(StringBuffer s) {
int prevChar; // Character at above position. Note that prevChar
// and thisChar may not be adjacent because combining
// characters between them will be ignored.
+ int prevCharX2; // Character before prevChar, more context for LB 21a
int nextPos; // Index of the next character following pos.
// Usually skips over combining marks.
// while the invalid values shift out and the "this" and
// "prev" positions are filled in with good values.
pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
- thisChar = prevChar = 0;
+ thisChar = prevChar = prevCharX2 = 0;
nextPos = startPos;
// "prevPos" can be arbitrarily far before "pos".
for (;;) {
// Advance to the next position to be tested.
+ prevCharX2 = prevChar;
prevPos = pos;
prevChar = thisChar;
pos = nextPos;
continue;
}
- // LB 22
+ // LB 21a, HL (HY | BA) x
+ if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
+ continue;
+ }
+
+ // LB 21b, SY x HL
+ if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
+ continue;
+ }
+
+ // LB 22
if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
+ fHL.contains(prevChar) && fIN.contains(thisChar) ||
fID.contains(prevChar) && fIN.contains(thisChar) ||
fIN.contains(prevChar) && fIN.contains(thisChar) ||
fNU.contains(prevChar) && fIN.contains(thisChar) ) {
// NU x AL
if (fID.contains(prevChar) && fPO.contains(thisChar) ||
fAL.contains(prevChar) && fNU.contains(thisChar) ||
- fNU.contains(prevChar) && fAL.contains(thisChar) ) {
- continue;
+ fHL.contains(prevChar) && fNU.contains(thisChar) ||
+ fNU.contains(prevChar) && fAL.contains(thisChar) ||
+ fNU.contains(prevChar) && fHL.contains(thisChar) ) {
+ continue;
}
// LB 24 Do not break between prefix and letters or ideographs.
// PR x AL
// PO x AL
if (fPR.contains(prevChar) && fID.contains(thisChar) ||
- fPR.contains(prevChar) && fAL.contains(thisChar) ||
- fPO.contains(prevChar) && fAL.contains(thisChar)) {
+ fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
+ fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
continue;
}
// LB 28 Do not break between alphabetics
- if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
+ if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
continue;
}
// LB 29 Do not break between numeric punctuation and alphabetics
- if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
+ if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
continue;
}
// LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
// (AL | NU) x OP
// CP x (AL | NU)
- if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
+ if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
continue;
}
- if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
+ if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
continue;
}
-
+ // LB 30a Do not break between regional indicators. RI × RI
+ if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
+ continue;
+ }
+
// LB 31 Break everywhere else
break;
}
/**
* return the index of the next code point in the input text.
* @param i the preceding index
- * @return
*/
static int nextCP(StringBuffer s, int i) {
if (i == -1) {
if (params.inclusion >= 9) {
loopCount = 2000;
}
-
logln("Word Break Monkey Test");
RBBIWordMonkey m = new RBBIWordMonkey();
BreakIterator bi = BreakIterator.getWordInstance(Locale.US);