jars/icu4j-4_8_1_1/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UnicodeNormalizerConformanceTest.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2008, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7
   8 package com.ibm.icu.dev.test.normalizer;
   9
  10 import java.io.BufferedReader;
  11 import java.io.IOException;
  12
  13 import com.ibm.icu.dev.test.TestFmwk;
  14 import com.ibm.icu.dev.test.TestUtil;
  15 import com.ibm.icu.text.UTF16;
  16 import com.ibm.icu.text.UnicodeSet;
  17
  18
  19 public class UnicodeNormalizerConformanceTest extends TestFmwk {
  20
  21     UnicodeNormalizer normalizer_C, normalizer_D, normalizer_KC, normalizer_KD;
  22
  23     public static void main(String[] args) throws Exception {
  24         new UnicodeNormalizerConformanceTest().run(args);
  25     }
  26
  27     public UnicodeNormalizerConformanceTest() {
  28         // Doesn't matter what the string and mode are; we'll change
  29         // them later as needed.
  30         normalizer_C = new UnicodeNormalizer(UnicodeNormalizer.C, true);
  31         normalizer_D = new UnicodeNormalizer(UnicodeNormalizer.D, false);
  32         normalizer_KC = new UnicodeNormalizer(UnicodeNormalizer.KC, false);
  33         normalizer_KD = new UnicodeNormalizer(UnicodeNormalizer.KD, false);
  34
  35     }
  36     // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
  37     static  String[] moreCases ={
  38         // Markus 2001aug30
  39         "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
  40
  41         // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
  42         "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
  43     };
  44
  45     /**
  46      * Test the conformance of NewNormalizer to
  47      * http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt.
  48      * This file must be located at the path specified as TEST_SUITE_FILE.
  49      */
  50     public void TestConformance() throws Exception{
  51         BufferedReader input = null;
  52         String line = null;
  53         String[] fields = new String[5];
  54         StringBuffer buf = new StringBuffer();
  55         int passCount = 0;
  56         int failCount = 0;
  57         UnicodeSet other = new UnicodeSet(0, 0x10ffff);
  58         int c=0;
  59         try {
  60             input = TestUtil.getDataReader("unicode/NormalizationTest.txt");
  61             for (int count = 0;;++count) {
  62                 line = input.readLine();
  63                 if (line == null) {
  64                     //read the extra test cases
  65                     if(count > moreCases.length) {
  66                         count = 0;
  67                     } else if(count == moreCases.length) {
  68                         // all done
  69                         break;
  70                     }
  71                     line = moreCases[count++];
  72                 }
  73                 if (line.length() == 0) continue;
  74
  75                 // Expect 5 columns of this format:
  76                 // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
  77
  78                 // Skip comments
  79                 if (line.charAt(0) == '#'  || line.charAt(0)=='@') continue;
  80
  81                 // Parse out the fields
  82                 hexsplit(line, ';', fields, buf);
  83
  84                 // Remove a single code point from the "other" UnicodeSet
  85                 if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
  86                     c=UTF16.charAt(fields[0],0);
  87                     if(0xac20<=c && c<=0xd73f) {
  88                         // not an exhaustive test run: skip most Hangul syllables
  89                         if(c==0xac20) {
  90                             other.remove(0xac20, 0xd73f);
  91                         }
  92                         continue;
  93                     }
  94                     other.remove(c);
  95                 }
  96                 if (checkConformance(fields, line)) {
  97                     ++passCount;
  98                 } else {
  99                     ++failCount;
 100                 }
 101                 if ((count % 1000) == 999) {
 102                     logln("Line " + (count+1));
 103                 }
 104             }
 105         } catch (IOException ex) {
 106             if (input != null) {
 107                 try {
 108                     input.close();
 109                 } catch (Exception ex2) {
 110                     System.out.print("");
 111                 }
 112             }
 113             ex.printStackTrace();
 114             throw new IllegalArgumentException("Couldn't read file "
 115               + ex.getClass().getName() + " " + ex.getMessage()
 116               + " line = " + line
 117               );
 118         }
 119
 120         if (failCount != 0) {
 121             errln("Total: " + failCount + " lines failed, " +
 122                   passCount + " lines passed");
 123         } else {
 124             logln("Total: " + passCount + " lines passed");
 125         }
 126     }
 127
 128     /**
 129      * Verify the conformance of the given line of the Unicode
 130      * normalization (UTR 15) test suite file.  For each line,
 131      * there are five columns, corresponding to field[0]..field[4].
 132      *
 133      * The following invariants must be true for all conformant implementations
 134      *  c2 == NFC(c1) == NFC(c2) == NFC(c3)
 135      *  c3 == NFD(c1) == NFD(c2) == NFD(c3)
 136      *  c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
 137      *  c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
 138      *
 139      * @param field the 5 columns
 140      * @param line the source line from the test suite file
 141      * @return true if the test passes
 142      */
 143     private boolean checkConformance(String[] field, String line) throws Exception{
 144         boolean pass = true;
 145        // StringBuffer buf = new StringBuffer(); // scratch
 146         String out;
 147         int i=0;
 148         for (i=0; i<5; ++i) {
 149             if (i<3) {
 150                 out = normalizer_C.normalize(field[i]);
 151                 pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
 152
 153                 out = normalizer_D.normalize(field[i]);
 154                 pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
 155
 156             }
 157             out = normalizer_KC.normalize(field[i]);
 158             pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
 159
 160             out = normalizer_KD.normalize(field[i]);
 161             pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
 162
 163         }
 164
 165         if (!pass) {
 166             errln("FAIL: " + line);
 167         }
 168
 169         return pass;
 170     }
 171
 172     /**
 173      * @param op name of normalization form, e.g., "KC"
 174      * @param s string being normalized
 175      * @param got value received
 176      * @param exp expected value
 177      * @param msg description of this test
 178      * @returns true if got == exp
 179      */
 180     private boolean assertEqual(String op, String s, String got,
 181                                 String exp, String msg) {
 182         if (exp.equals(got)) {
 183             return true;
 184         }
 185         errln(("      " + msg + ") " + op + "(" + s + ")=" + hex(got) +
 186                              ", exp. " + hex(exp)));
 187         return false;
 188     }
 189
 190     /**
 191      * Split a string into pieces based on the given delimiter
 192      * character.  Then, parse the resultant fields from hex into
 193      * characters.  That is, "0040 0400;0C00;0899" -> new String[] {
 194      * "\u0040\u0400", "\u0C00", "\u0899" }.  The output is assumed to
 195      * be of the proper length already, and exactly output.length
 196      * fields are parsed.  If there are too few an exception is
 197      * thrown.  If there are too many the extras are ignored.
 198      *
 199      * @param buf scratch buffer
 200      */
 201     private static void hexsplit(String s, char delimiter,
 202                                  String[] output, StringBuffer buf) {
 203         int i;
 204         int pos = 0;
 205         for (i=0; i<output.length; ++i) {
 206             int delim = s.indexOf(delimiter, pos);
 207             if (delim < 0) {
 208                 throw new IllegalArgumentException("Missing field in " + s);
 209             }
 210             // Our field is from pos..delim-1.
 211             buf.setLength(0);
 212
 213             String toHex = s.substring(pos,delim);
 214             pos = delim;
 215             int index = 0;
 216             int len = toHex.length();
 217             while(index< len){
 218                 if(toHex.charAt(index)==' '){
 219                     index++;
 220                 }else{
 221                     int spacePos = toHex.indexOf(' ', index);
 222                     if(spacePos==-1){
 223                         appendInt(buf,toHex.substring(index,len),s);
 224                         spacePos = len;
 225                     }else{
 226                         appendInt(buf,toHex.substring(index, spacePos),s);
 227                     }
 228                     index = spacePos+1;
 229                 }
 230             }
 231
 232             if (buf.length() < 1) {
 233                 throw new IllegalArgumentException("Empty field " + i + " in " + s);
 234             }
 235             output[i] = buf.toString();
 236             ++pos; // Skip over delim
 237         }
 238     }
 239     public static void appendInt(StringBuffer buf, String strToHex, String s){
 240         int hex = Integer.parseInt(strToHex,16);
 241         if (hex < 0 ) {
 242             throw new IllegalArgumentException("Out of range hex " +
 243                                                 hex + " in " + s);
 244         }else if (hex > 0xFFFF){
 245             buf.append((char)((hex>>10)+0xd7c0));
 246             buf.append((char)((hex&0x3ff)|0xdc00));
 247         }else{
 248             buf.append((char) hex);
 249         }
 250     }
 251
 252     // Specific tests for debugging.  These are generally failures
 253     // taken from the conformance file, but culled out to make
 254     // debugging easier.  These can be eliminated without affecting
 255     // coverage.
 256
 257     public void _hideTestCase6() throws Exception{
 258         _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
 259     }
 260
 261     public void _testOneLine(String line) throws Exception{
 262         String[] fields = new String[5];
 263         StringBuffer buf = new StringBuffer();
 264         // Parse out the fields
 265         hexsplit(line, ';', fields, buf);
 266         checkConformance(fields, line);
 267     }
 268
 269
 270 }