2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.dev.test.normalizer;
\r
10 import java.io.BufferedReader;
\r
11 import java.io.IOException;
\r
13 import com.ibm.icu.dev.test.TestFmwk;
\r
14 import com.ibm.icu.dev.test.TestUtil;
\r
15 import com.ibm.icu.text.UTF16;
\r
16 import com.ibm.icu.text.UnicodeSet;
\r
19 public class UnicodeNormalizerConformanceTest extends TestFmwk {
\r
21 UnicodeNormalizer normalizer_C, normalizer_D, normalizer_KC, normalizer_KD;
\r
23 public static void main(String[] args) throws Exception {
\r
24 new UnicodeNormalizerConformanceTest().run(args);
\r
27 public UnicodeNormalizerConformanceTest() {
\r
28 // Doesn't matter what the string and mode are; we'll change
\r
29 // them later as needed.
\r
30 normalizer_C = new UnicodeNormalizer(UnicodeNormalizer.C, true);
\r
31 normalizer_D = new UnicodeNormalizer(UnicodeNormalizer.D, false);
\r
32 normalizer_KC = new UnicodeNormalizer(UnicodeNormalizer.KC, false);
\r
33 normalizer_KD = new UnicodeNormalizer(UnicodeNormalizer.KD, false);
\r
36 // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
\r
37 static String[] moreCases ={
\r
39 "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
\r
41 // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
\r
42 "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
\r
46 * Test the conformance of UnicodeNormalizer to
\r
47 * http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt.
\r
48 * The test data is read via TestUtil.getDataReader("unicode/NormalizationTest.txt").
\r
50 public void TestConformance() throws Exception{
\r
51 BufferedReader input = null;
\r
53 String[] fields = new String[5];
\r
54 StringBuffer buf = new StringBuffer();
\r
57 UnicodeSet other = new UnicodeSet(0, 0x10ffff);
\r
60 input = TestUtil.getDataReader("unicode/NormalizationTest.txt");
\r
61 for (int count = 0;;++count) {
\r
62 line = input.readLine();
\r
64 //read the extra test cases
\r
65 if(count > moreCases.length) {
\r
67 } else if(count == moreCases.length) {
\r
71 line = moreCases[count++];
\r
73 if (line.length() == 0) continue;
\r
75 // Expect 5 columns of this format:
\r
76 // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
\r
79 if (line.charAt(0) == '#' || line.charAt(0)=='@') continue;
\r
81 // Parse out the fields
\r
82 hexsplit(line, ';', fields, buf);
\r
84 // Remove a single code point from the "other" UnicodeSet
\r
85 if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
\r
86 c=UTF16.charAt(fields[0],0);
\r
87 if(0xac20<=c && c<=0xd73f) {
\r
88 // not an exhaustive test run: skip most Hangul syllables
\r
90 other.remove(0xac20, 0xd73f);
\r
96 if (checkConformance(fields, line)) {
\r
101 if ((count % 1000) == 999) {
\r
102 logln("Line " + (count+1));
\r
105 } catch (IOException ex) {
\r
106 if (input != null) {
\r
109 } catch (Exception ex2) {
\r
110 System.out.print("");
\r
113 ex.printStackTrace();
\r
114 throw new IllegalArgumentException("Couldn't read file "
\r
115 + ex.getClass().getName() + " " + ex.getMessage()
\r
116 + " line = " + line
\r
120 if (failCount != 0) {
\r
121 errln("Total: " + failCount + " lines failed, " +
\r
122 passCount + " lines passed");
\r
124 logln("Total: " + passCount + " lines passed");
\r
129 * Verify the conformance of the given line of the Unicode
\r
130 * normalization (UTR 15) test suite file. For each line,
\r
131 * there are five columns, corresponding to field[0]..field[4].
\r
133 * The following invariants must be true for all conformant implementations
\r
134 * c2 == NFC(c1) == NFC(c2) == NFC(c3)
\r
135 * c3 == NFD(c1) == NFD(c2) == NFD(c3)
\r
136 * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
\r
137 * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
\r
139 * @param field the 5 columns
\r
140 * @param line the source line from the test suite file
\r
141 * @return true if the test passes
\r
143 private boolean checkConformance(String[] field, String line) throws Exception{
\r
144 boolean pass = true;
\r
145 // StringBuffer buf = new StringBuffer(); // scratch
\r
148 for (i=0; i<5; ++i) {
\r
150 out = normalizer_C.normalize(field[i]);
\r
151 pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
\r
153 out = normalizer_D.normalize(field[i]);
\r
154 pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
\r
157 out = normalizer_KC.normalize(field[i]);
\r
158 pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
\r
160 out = normalizer_KD.normalize(field[i]);
\r
161 pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
\r
166 errln("FAIL: " + line);
\r
173 * @param op name of normalization form, e.g., "KC"
\r
174 * @param s string being normalized
\r
175 * @param got value received
\r
176 * @param exp expected value
\r
177 * @param msg description of this test
\r
178 * @return true if got == exp
\r
180 private boolean assertEqual(String op, String s, String got,
\r
181 String exp, String msg) {
\r
182 if (exp.equals(got)) {
\r
185 errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got) +
\r
186 ", exp. " + hex(exp)));
\r
191 * Split a string into pieces based on the given delimiter
\r
192 * character. Then, parse the resultant fields from hex into
\r
193 * characters. That is, "0040 0400;0C00;0899;" -> new String[] {
\r
194 * "\u0040\u0400", "\u0C00", "\u0899" }. The output is assumed to
\r
195 * be of the proper length already, and exactly output.length
\r
196 * fields are parsed. If there are too few an exception is
\r
197 * thrown. If there are too many the extras are ignored.
\r
199 * @param buf scratch buffer
\r
201 private static void hexsplit(String s, char delimiter,
\r
202 String[] output, StringBuffer buf) {
\r
205 for (i=0; i<output.length; ++i) {
\r
206 int delim = s.indexOf(delimiter, pos);
\r
208 throw new IllegalArgumentException("Missing field in " + s);
\r
210 // Our field is from pos..delim-1.
\r
213 String toHex = s.substring(pos,delim);
\r
216 int len = toHex.length();
\r
218 if(toHex.charAt(index)==' '){
\r
221 int spacePos = toHex.indexOf(' ', index);
\r
223 appendInt(buf,toHex.substring(index,len),s);
\r
226 appendInt(buf,toHex.substring(index, spacePos),s);
\r
228 index = spacePos+1;
\r
232 if (buf.length() < 1) {
\r
233 throw new IllegalArgumentException("Empty field " + i + " in " + s);
\r
235 output[i] = buf.toString();
\r
236 ++pos; // Skip over delim
\r
239 public static void appendInt(StringBuffer buf, String strToHex, String s){
\r
240 int hex = Integer.parseInt(strToHex,16);
\r
242 throw new IllegalArgumentException("Out of range hex " +
\r
244 }else if (hex > 0xFFFF){
\r
245 buf.append((char)((hex>>10)+0xd7c0));
\r
246 buf.append((char)((hex&0x3ff)|0xdc00));
\r
248 buf.append((char) hex);
\r
252 // Specific tests for debugging. These are generally failures
\r
253 // taken from the conformance file, but culled out to make
\r
254 // debugging easier. These can be eliminated without affecting
\r
257 public void _hideTestCase6() throws Exception{
\r
258 _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
\r
261 public void _testOneLine(String line) throws Exception{
\r
262 String[] fields = new String[5];
\r
263 StringBuffer buf = new StringBuffer();
\r
264 // Parse out the fields
\r
265 hexsplit(line, ';', fields, buf);
\r
266 checkConformance(fields, line);
\r