gitweb.fperrin.net Git - Dictionary.git/blobdiff - jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / main / tests / core / src / com / ibm / icu / dev / test / rbbi / RBBITestMonkey.java
similarity index 90%
rename from jars/icu4j-4_8_1_1/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
rename to jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
index 8b30deb123bffb88bd3105e677710fed125794ba..25e2d067890c443c0724619f6f602767b382aa06 100644 (file)
@@ -1,7 +1,7 @@
 /*
  *******************************************************************************
- * Copyright (C) 2003-2010 International Business Machines Corporation and     *
- * others. All Rights Reserved.                                                *
+ * Copyright (C) 2003-2013 International Business Machines Corporation and
+ * others. All Rights Reserved.
  *******************************************************************************
  */
  package com.ibm.icu.dev.test.rbbi;
@@ -57,12 +57,12 @@ public class RBBITestMonkey extends TestFmwk {
         // Set the test text on which subsequent calls to next() will operate
         abstract  void   setText(StringBuffer text);
 
-        // Find the next break postion, starting from the specified position.
+        // Find the next break position, starting from the specified position.
         // Return -1 after reaching end of string.
         abstract   int   next(int i);
         
         // A Character Property, one of the constants defined in class UProperty.
-        //   The value fo this property will be displayed for the characters
+        //   The value of this property will be displayed for the characters
         //    near any test failure.  
         int   fCharProperty;
     }
@@ -70,6 +70,7 @@ public class RBBITestMonkey extends TestFmwk {
  
     /**
      * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
+     * Note: As of Unicode 6.1, fPrependSet is empty, so it is added to fSets only when non-empty.
      */
     static class RBBICharMonkey extends RBBIMonkeyKind {
         List                      fSets;
@@ -77,6 +78,7 @@ public class RBBITestMonkey extends TestFmwk {
         UnicodeSet                fCRLFSet;
         UnicodeSet                fControlSet;
         UnicodeSet                fExtendSet;
+        UnicodeSet                fRegionalIndicatorSet;
         UnicodeSet                fPrependSet;
         UnicodeSet                fSpacingSet;
         UnicodeSet                fLSet;
@@ -96,6 +98,7 @@ public class RBBITestMonkey extends TestFmwk {
         fCRLFSet    = new UnicodeSet("[\\r\\n]");
         fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
         fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
+        fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
         fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
         fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
         fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
@@ -116,7 +119,10 @@ public class RBBITestMonkey extends TestFmwk {
         fSets.add(fCRLFSet);
         fSets.add(fControlSet);
         fSets.add(fExtendSet);
-        fSets.add(fPrependSet);
+        fSets.add(fRegionalIndicatorSet);
+        if (!fPrependSet.isEmpty()) {
+            fSets.add(fPrependSet);
+        }
         fSets.add(fSpacingSet);
         fSets.add(fHangulSet);
         fSets.add(fAnySet);
@@ -212,11 +218,16 @@ public class RBBITestMonkey extends TestFmwk {
                 continue;
             }
     
+            // Rule (GB8a)   Regional_Indicator x Regional_Indicator
+            if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
+                continue;
+            }
+            
             // Rule (GB9)    Numeric x ALetter
             if (fExtendSet.contains(c2))  {
                 continue;
             }
-    
+            
             // Rule (GB9a)   x  SpacingMark
             if (fSpacingSet.contains(c2)) {
                 continue;
@@ -251,8 +262,12 @@ public class RBBITestMonkey extends TestFmwk {
         UnicodeSet                fCRSet;
         UnicodeSet                fLFSet;
         UnicodeSet                fNewlineSet;
+        UnicodeSet                fRegionalIndicatorSet;
         UnicodeSet                fKatakanaSet;
+        UnicodeSet                fHebrew_LetterSet;
         UnicodeSet                fALetterSet;
+        UnicodeSet                fSingle_QuoteSet;
+        UnicodeSet                fDouble_QuoteSet;
         UnicodeSet                fMidNumLetSet;
         UnicodeSet                fMidLetterSet;
         UnicodeSet                fMidNumSet;
@@ -260,17 +275,24 @@ public class RBBITestMonkey extends TestFmwk {
         UnicodeSet                fFormatSet;
         UnicodeSet                fExtendSet;
         UnicodeSet                fExtendNumLetSet;
-        UnicodeSet                fOtherSet;
+        UnicodeSet                fOtherSet;        
+        UnicodeSet                fDictionaryCjkSet;
 
         
         RBBIWordMonkey() {
             fCharProperty    = UProperty.WORD_BREAK;
 
+            fDictionaryCjkSet= new UnicodeSet("[[:Script=Hangul:][:Han:][:Hiragana:][:Katakana:]]");
             fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
             fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
             fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
-            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
+            fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");            
             fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
+            fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");            
+            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
+            fALetterSet.removeAll(fDictionaryCjkSet);
+            fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
+            fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");           
             fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
             fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
             fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
@@ -285,22 +307,32 @@ public class RBBITestMonkey extends TestFmwk {
             fOtherSet.removeAll(fLFSet);
             fOtherSet.removeAll(fNewlineSet);
             fOtherSet.removeAll(fALetterSet);
+            fOtherSet.removeAll(fSingle_QuoteSet);
+            fOtherSet.removeAll(fDouble_QuoteSet);
             fOtherSet.removeAll(fKatakanaSet);
+            fOtherSet.removeAll(fHebrew_LetterSet);
             fOtherSet.removeAll(fMidLetterSet);
             fOtherSet.removeAll(fMidNumSet);
             fOtherSet.removeAll(fNumericSet);
             fOtherSet.removeAll(fFormatSet);
             fOtherSet.removeAll(fExtendSet);
             fOtherSet.removeAll(fExtendNumLetSet);
+            fOtherSet.removeAll(fRegionalIndicatorSet);
             // Inhibit dictionary characters from being tested at all.
-            fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
+            // remove surrogates so as to not generate higher CJK characters
+            fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
+            fOtherSet.removeAll(fDictionaryCjkSet);
 
             fSets            = new ArrayList();
             fSets.add(fCRSet);
             fSets.add(fLFSet);
             fSets.add(fNewlineSet);
+            fSets.add(fRegionalIndicatorSet);
+            fSets.add(fHebrew_LetterSet);
             fSets.add(fALetterSet);
-            fSets.add(fKatakanaSet);
+            //fSets.add(fKatakanaSet); // TODO: work out how to test katakana
+            fSets.add(fSingle_QuoteSet);
+            fSets.add(fDouble_QuoteSet);
             fSets.add(fMidLetterSet);
             fSets.add(fMidNumLetSet);
             fSets.add(fMidNumSet);
@@ -345,7 +377,7 @@ public class RBBITestMonkey extends TestFmwk {
                 p1 = p2;  c1 = c2;
                 p2 = p3;  c2 = c3;
                 
-                // Advancd p3 by    X(Extend | Format)*   Rule 4
+                // Advance p3 by    X(Extend | Format)*   Rule 4
                 //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
                 do {
                     p3 = moveIndex32(fText, p3, 1);
@@ -386,25 +418,39 @@ public class RBBITestMonkey extends TestFmwk {
                     break;
                 }
 
-                // Rule (5).   ALetter x ALetter
-                if (fALetterSet.contains(c1) &&
-                        fALetterSet.contains(c2))  {
+                // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
+                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
+                    (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
                     continue;
                 }
-                
-                // Rule (6)  ALetter  x  (MidLetter | MidNumLet)  ALetter
+               
+                // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
                 //
-                if ( fALetterSet.contains(c1) &&
-                        (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
-                        setContains(fALetterSet, c3)) {
+                if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
+                     (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
+                     (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
                     continue;
                 }
-                
-                
-                // Rule (7)  ALetter (MidLetter | MidNumLet)   x  ALetter
-                if (fALetterSet.contains(c0) &&
-                        (fMidLetterSet.contains(c1) ||  fMidNumLetSet.contains(c1))  &&
-                        fALetterSet.contains(c2)) {
+
+                // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
+                if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
+                    (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
+                    (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
+                    continue;
+                }
+
+                // Rule (7a)     Hebrew_Letter x Single_Quote
+                if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
+                    continue;
+                }
+
+                // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
+                if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
+                    continue;
+                }
+
+                // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
+                if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
                     continue;
                 }
                 
@@ -414,29 +460,29 @@ public class RBBITestMonkey extends TestFmwk {
                     continue;
                 }
                 
-                // Rule (9)    ALetter x Numeric
-                if (fALetterSet.contains(c1) &&
-                        fNumericSet.contains(c2))  {
+                // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
+                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
+                    fNumericSet.contains(c2))  {
                     continue;
                 }
 
-                // Rule (10)    Numeric x ALetter
+                // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
                 if (fNumericSet.contains(c1) &&
-                        fALetterSet.contains(c2))  {
+                    (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
                     continue;
                 }
-                
-                // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
-                if ( fNumericSet.contains(c0) &&
-                        (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1))  && 
+
+                // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
+                if (fNumericSet.contains(c0) &&
+                        (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
                         fNumericSet.contains(c2)) {
                     continue;
                 }
                 
-                // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
+                // Rule (12)  Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
                 if (fNumericSet.contains(c1) &&
-                        (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
-                        setContains(fNumericSet, c3)) {
+                    (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
+                    setContains(fNumericSet, c3)) {
                     continue;
                 }
                 
@@ -445,20 +491,28 @@ public class RBBITestMonkey extends TestFmwk {
                         fKatakanaSet.contains(c2))  {
                     continue;
                 }
-                
-                // Rule 13a  (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
-                if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
+
+                // Rule 13a    (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
+                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
                         fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
                         fExtendNumLetSet.contains(c2)) {
                     continue;
                 }
-                // Rule 13b   ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
+                
+                // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
                 if (fExtendNumLetSet.contains(c1) &&
-                        (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
-                        fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
+                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
+                         fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
                     continue;
                 }
-               
+
+                
+                // Rule 13c   Do not break between Regional Indicators. 
+                //            Regional_Indicator  ×   Regional_Indicator
+                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
+                    continue;
+                }
+                
                 // Rule 14.  Break found here.
                 break;
             }
@@ -503,6 +557,7 @@ public class RBBITestMonkey extends TestFmwk {
         UnicodeSet  fSY;
         UnicodeSet  fAI;
         UnicodeSet  fAL;
+        UnicodeSet  fHL;
         UnicodeSet  fID;
         UnicodeSet  fSA;
         UnicodeSet  fJL;
@@ -510,6 +565,7 @@ public class RBBITestMonkey extends TestFmwk {
         UnicodeSet  fJT;
         UnicodeSet  fH2;
         UnicodeSet  fH3;
+        UnicodeSet  fRI;
         UnicodeSet  fXX;
         
         StringBuffer  fText;
@@ -550,6 +606,7 @@ public class RBBITestMonkey extends TestFmwk {
             fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
             fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
             fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
+            fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
             fID    = new UnicodeSet("[\\p{Line_break=ID}]");
             fSA    = new UnicodeSet("[\\p{Line_break=SA}]");
             fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
@@ -558,6 +615,7 @@ public class RBBITestMonkey extends TestFmwk {
             fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
             fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
             fSG    = new UnicodeSet("[\\ud800-\\udfff]");
+            fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
             fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
 
             
@@ -601,11 +659,12 @@ public class RBBITestMonkey extends TestFmwk {
             fSets.add(fSY);
             fSets.add(fAI);
             fSets.add(fAL);
+            fSets.add(fHL);
             fSets.add(fID);
             fSets.add(fWJ);
             fSets.add(fSA);
             fSets.add(fSG);
-            
+            fSets.add(fRI);
         }
         
         void setText(StringBuffer s) {
@@ -623,6 +682,7 @@ public class RBBITestMonkey extends TestFmwk {
             int    prevChar;  //  Character at above position.  Note that prevChar
                               //   and thisChar may not be adjacent because combining
                               //   characters between them will be ignored.
+            int    prevCharX2; //  Character before prevChar, more context for LB 21a
             
             int    nextPos;   //  Index of the next character following pos.
                               //     Usually skips over combining marks.
@@ -639,7 +699,7 @@ public class RBBITestMonkey extends TestFmwk {
             //                           while the invalid values shift out and the "this" and
             //                           "prev" positions are filled in with good values.
             pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
-            thisChar = prevChar  = 0;
+            thisChar = prevChar  = prevCharX2 = 0;
             nextPos  = startPos;
             
             
@@ -650,6 +710,7 @@ public class RBBITestMonkey extends TestFmwk {
             //  "prevPos" can be arbitrarily far before "pos".
             for (;;) {
                 // Advance to the next position to be tested.
+                prevCharX2 = prevChar;
                 prevPos   = pos;
                 prevChar  = thisChar;
                 pos       = nextPos;
@@ -864,8 +925,19 @@ public class RBBITestMonkey extends TestFmwk {
                     continue;
                 }
                 
-                // LB 22
+                 // LB 21a, HL (HY | BA) x
+                if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
+                    continue;
+                }
+
+                 // LB 21b, SY x HL
+                if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
+                    continue;
+                }
+                
+               // LB 22
                 if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
+                        fHL.contains(prevChar) && fIN.contains(thisChar) ||
                         fID.contains(prevChar) && fIN.contains(thisChar) ||
                         fIN.contains(prevChar) && fIN.contains(thisChar) ||
                         fNU.contains(prevChar) && fIN.contains(thisChar) )   {
@@ -878,8 +950,10 @@ public class RBBITestMonkey extends TestFmwk {
                 //          NU x AL
                 if (fID.contains(prevChar) && fPO.contains(thisChar) ||
                         fAL.contains(prevChar) && fNU.contains(thisChar) ||
-                        fNU.contains(prevChar) && fAL.contains(thisChar) )   {
-                    continue;
+                        fHL.contains(prevChar) && fNU.contains(thisChar) ||
+                        fNU.contains(prevChar) && fAL.contains(thisChar) ||
+                        fNU.contains(prevChar) && fHL.contains(thisChar) )   {
+                   continue;
                 }
                 
                 // LB 24  Do not break between prefix and letters or ideographs.
@@ -887,8 +961,8 @@ public class RBBITestMonkey extends TestFmwk {
                 //        PR x AL
                 //        PO x AL
                 if (fPR.contains(prevChar) && fID.contains(thisChar) ||
-                    fPR.contains(prevChar) && fAL.contains(thisChar) ||
-                    fPO.contains(prevChar) && fAL.contains(thisChar))  {
+                    fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
+                    fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)))  {
                     continue;
                 }
                 
@@ -955,26 +1029,30 @@ public class RBBITestMonkey extends TestFmwk {
                 
                 
                 // LB 28 Do not break between alphabetics
-                if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
+                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                     continue;
                 }
                 
                 // LB 29  Do not break between numeric punctuation and alphabetics
-                if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
+                if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                     continue;
                 }
                 
                 // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
                 //          (AL | NU) x OP
                 //          CP x (AL | NU)
-                if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
+                if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
                     continue;
                 }
-                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
+                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
                     continue;
                 }
 
-              
+                // LB 30a   Do not break between regional indicators.  RI × RI
+                if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
+                    continue;
+                }
+                
                 // LB 31    Break everywhere else
                 break;            
             }
@@ -1481,7 +1559,6 @@ public class RBBITestMonkey extends TestFmwk {
     /**
      * return the index of the next code point in the input text.
      * @param i the preceding index
-     * @return
      */
     static int  nextCP(StringBuffer s, int i) {
         if (i == -1) {
@@ -1944,7 +2021,6 @@ public void TestRTWordMonkey() {
     if (params.inclusion >= 9) {
         loopCount = 2000;
     }
-    
     logln("Word Break Monkey Test");
     RBBIWordMonkey  m = new RBBIWordMonkey();
     BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);