/*
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
\r
8 package com.ibm.icu.dev.test.normalizer;
\r
10 import java.io.BufferedReader;
\r
11 import java.io.IOException;
\r
12 import java.text.StringCharacterIterator;
\r
14 import com.ibm.icu.dev.test.TestFmwk;
\r
15 import com.ibm.icu.dev.test.TestUtil;
\r
16 import com.ibm.icu.impl.Utility;
\r
17 import com.ibm.icu.text.Normalizer;
\r
18 import com.ibm.icu.text.UTF16;
\r
19 import com.ibm.icu.text.UnicodeSet;
\r
21 public class ConformanceTest extends TestFmwk {
\r
23 Normalizer normalizer;
\r
25 public static void main(String[] args) throws Exception {
\r
26 new ConformanceTest().run(args);
\r
/**
 * Creates the test and the shared Normalizer used by the iterative checks.
 */
public ConformanceTest() {
    // Doesn't matter what the string and mode are; we'll change
    // them later as needed.
    normalizer = new Normalizer("", Normalizer.NFC, 0);
}
\r
34 // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
\r
35 static String[] moreCases ={
\r
37 "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
\r
39 // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
\r
40 "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
\r
44 * Test the conformance of Normalizer to
\r
45 * http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt.* http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
\r
46 * This file must be located at the path specified as TEST_SUITE_FILE.
\r
48 public void TestConformance() throws Exception{
\r
49 runConformance("unicode/NormalizationTest.txt",0);
\r
51 public void TestConformance_3_2() throws Exception{
\r
52 runConformance("unicode/NormalizationTest-3.2.0.txt",Normalizer.UNICODE_3_2);
\r
55 public void runConformance(String fileName, int options) throws Exception{
\r
56 BufferedReader input = null;
\r
58 String[] fields = new String[5];
\r
59 StringBuffer buf = new StringBuffer();
\r
62 UnicodeSet other = new UnicodeSet(0, 0x10ffff);
\r
65 input = TestUtil.getDataReader(fileName);
\r
66 for (int count = 0;;++count) {
\r
67 line = input.readLine();
\r
69 //read the extra test cases
\r
70 if(count > moreCases.length) {
\r
72 } else if(count == moreCases.length) {
\r
76 line = moreCases[count++];
\r
78 if (line.length() == 0) continue;
\r
80 // Expect 5 columns of this format:
\r
81 // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
\r
84 if (line.charAt(0) == '#' || line.charAt(0)=='@') continue;
\r
86 // Parse out the fields
\r
87 hexsplit(line, ';', fields, buf);
\r
89 // Remove a single code point from the "other" UnicodeSet
\r
90 if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
\r
91 c=UTF16.charAt(fields[0],0);
\r
92 if(0xac20<=c && c<=0xd73f) {
\r
93 // not an exhaustive test run: skip most Hangul syllables
\r
95 other.remove(0xac20, 0xd73f);
\r
101 if (checkConformance(fields, line,options)) {
\r
106 if ((count % 1000) == 999) {
\r
107 logln("Line " + (count+1));
\r
110 } catch (IOException ex) {
\r
111 if (input != null) {
\r
114 } catch (Exception ex2) {
\r
115 System.out.print("");
\r
118 ex.printStackTrace();
\r
119 throw new IllegalArgumentException("Couldn't read file "
\r
120 + ex.getClass().getName() + " " + ex.getMessage()
\r
121 + " line = " + line
\r
125 if (failCount != 0) {
\r
126 errln("Total: " + failCount + " lines failed, " +
\r
127 passCount + " lines passed");
\r
129 logln("Total: " + passCount + " lines passed");
\r
134 * Verify the conformance of the given line of the Unicode
\r
135 * normalization (UTR 15) test suite file. For each line,
\r
136 * there are five columns, corresponding to field[0]..field[4].
\r
138 * The following invariants must be true for all conformant implementations
\r
139 * c2 == NFC(c1) == NFC(c2) == NFC(c3)
\r
140 * c3 == NFD(c1) == NFD(c2) == NFD(c3)
\r
141 * c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
\r
142 * c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
\r
144 * @param field the 5 columns
\r
145 * @param line the source line from the test suite file
\r
146 * @return true if the test passes
\r
148 private boolean checkConformance(String[] field, String line, int options) throws Exception{
\r
149 boolean pass = true;
\r
150 StringBuffer buf = new StringBuffer(); // scratch
\r
153 for (i=0; i<5; ++i) {
\r
155 out = Normalizer.normalize(field[i], Normalizer.NFC, options);
\r
156 pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
\r
158 out = iterativeNorm(field[i], Normalizer.NFC, buf, +1,options);
\r
159 pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
\r
161 out = iterativeNorm(field[i], Normalizer.NFC, buf, -1,options);
\r
162 pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
\r
164 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, +1,options);
\r
165 pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
\r
167 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, -1,options);
\r
168 pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
\r
170 out = Normalizer.normalize(field[i], Normalizer.NFD);
\r
171 pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
\r
173 out = iterativeNorm(field[i], Normalizer.NFD, buf, +1,options);
\r
174 pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
\r
176 out = iterativeNorm(field[i], Normalizer.NFD, buf, -1,options);
\r
177 pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
\r
179 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, +1,options);
\r
180 pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
\r
182 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, -1,options);
\r
183 pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
\r
185 cross(field[2] /*NFD String*/, field[1]/*NFC String*/, Normalizer.NFC);
\r
186 cross(field[1] /*NFC String*/, field[2]/*NFD String*/, Normalizer.NFD);
\r
188 out = Normalizer.normalize(field[i], Normalizer.NFKC,options);
\r
189 pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
\r
191 out = iterativeNorm(field[i], Normalizer.NFKC, buf, +1,options);
\r
192 pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
\r
194 out = iterativeNorm(field[i], Normalizer.NFKC, buf, -1,options);
\r
195 pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
\r
197 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, +1,options);
\r
198 pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
\r
200 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, -1,options);
\r
201 pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
\r
204 out = Normalizer.normalize(field[i], Normalizer.NFKD,options);
\r
205 pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
\r
207 out = iterativeNorm(field[i], Normalizer.NFKD, buf, +1,options);
\r
208 pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
\r
210 out = iterativeNorm(field[i], Normalizer.NFKD, buf, -1,options);
\r
211 pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
\r
213 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, +1,options);
\r
214 pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
\r
216 out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, -1,options);
\r
217 pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
\r
219 cross(field[4] /*NFKD String*/, field[3]/*NFKC String*/, Normalizer.NFKC);
\r
220 cross(field[3] /*NFKC String*/, field[4]/*NFKD String*/, Normalizer.NFKD);
\r
223 compare(field[1],field[2]);
\r
224 compare(field[0],field[1]);
\r
225 compare(field[0],field[2]);
\r
226 // test quick checks
\r
227 if(Normalizer.NO == Normalizer.quickCheck(field[1], Normalizer.NFC,options)) {
\r
228 errln("Normalizer error: quickCheck(NFC(s), Normalizer.NFC) is Normalizer.NO");
\r
231 if(Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.NFD,options)) {
\r
232 errln("Normalizer error: quickCheck(NFD(s), Normalizer.NFD) is Normalizer.NO");
\r
235 if(Normalizer.NO == Normalizer.quickCheck(field[3], Normalizer.NFKC,options)) {
\r
236 errln("Normalizer error: quickCheck(NFKC(s), Normalizer.NFKC) is Normalizer.NO");
\r
239 if(Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.NFKD,options)) {
\r
240 errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
\r
244 if(!Normalizer.isNormalized(field[1], Normalizer.NFC, options)) {
\r
245 errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
\r
248 if(!field[0].equals(field[1]) && Normalizer.isNormalized(field[0], Normalizer.NFC, options)) {
\r
249 errln("Normalizer error: isNormalized(s, Normalizer.NFC) is TRUE");
\r
252 if(!Normalizer.isNormalized(field[3], Normalizer.NFKC, options)) {
\r
253 errln("Normalizer error: isNormalized(NFKC(s), Normalizer.NFKC) is false");
\r
256 if(!field[0].equals(field[3]) && Normalizer.isNormalized(field[0], Normalizer.NFKC, options)) {
\r
257 errln("Normalizer error: isNormalized(s, Normalizer.NFKC) is TRUE");
\r
260 // test api that takes a char[]
\r
261 if(!Normalizer.isNormalized(field[1].toCharArray(),0,field[1].length(), Normalizer.NFC,options)) {
\r
262 errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
\r
265 // test api that takes a codepoint
\r
266 if(!Normalizer.isNormalized(UTF16.charAt(field[1],0), Normalizer.NFC,options)) {
\r
267 errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
\r
270 // test FCD quick check and "makeFCD"
\r
271 fcd=Normalizer.normalize(field[0], Normalizer.FCD);
\r
272 if(Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD,options)) {
\r
273 errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
\r
276 // check FCD return length
\r
278 char[] fcd2 = new char[ fcd.length() * 2 ];
\r
279 char[] src = field[0].toCharArray();
\r
280 int fcdLen = Normalizer.normalize(src, 0, src.length, fcd2, fcd.length(), fcd2.length,Normalizer.FCD, 0);
\r
281 if(fcdLen != fcd.length()){
\r
282 errln("makeFCD did not return the correct length");
\r
285 if(Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD, options)) {
\r
286 errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
\r
289 if(Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.FCD, options)) {
\r
290 errln("Normalizer error: quickCheck(NFD(s), Normalizer.FCD) is Normalizer.NO");
\r
294 if(Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.FCD, options)) {
\r
295 errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
\r
299 out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1,options);
\r
300 out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1,options);
\r
302 out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1,options);
\r
303 out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1,options);
\r
305 out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1,options);
\r
306 out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1,options);
\r
308 out=Normalizer.normalize(fcd, Normalizer.NFD);
\r
309 if(!out.equals(field[2])) {
\r
310 errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
\r
314 errln("FAIL: " + line);
\r
316 if(field[0]!=field[2]) {
\r
317 // two strings that are canonically equivalent must test
\r
318 // equal under a canonical caseless match
\r
319 // see UAX #21 Case Mappings and Jitterbug 2021 and
\r
320 // Unicode Technical Committee meeting consensus 92-C31
\r
322 if((rc = Normalizer.compare(field[0], field[2], (options<<Normalizer.COMPARE_NORM_OPTIONS_SHIFT)|Normalizer.COMPARE_IGNORE_CASE))!=0){
\r
323 errln("Normalizer.compare(original, NFD, case-insensitive) returned "+rc+" instead of 0 for equal");
\r
330 // two strings that are canonically equivalent must test
\r
331 // equal under a canonical caseless match
\r
332 // see UAX #21 Case Mappings and Jitterbug 2021 and
\r
333 // Unicode Technical Committee meeting consensus 92-C31
\r
334 private void compare(String s1, String s2){
\r
335 if(s1.length()==1 && s2.length()==1){
\r
336 if(Normalizer.compare(UTF16.charAt(s1,0),UTF16.charAt(s2,0),Normalizer.COMPARE_IGNORE_CASE)!=0){
\r
337 errln("Normalizer.compare(int,int) failed for s1: "
\r
338 +Utility.hex(s1) + " s2: " + Utility.hex(s2));
\r
341 if(s1.length()==1 && s2.length()>1){
\r
342 if(Normalizer.compare(UTF16.charAt(s1,0),s2,Normalizer.COMPARE_IGNORE_CASE)!=0){
\r
343 errln("Normalizer.compare(int,String) failed for s1: "
\r
344 +Utility.hex(s1) + " s2: " + Utility.hex(s2));
\r
347 if(s1.length()>1 && s2.length()>1){
\r
348 // TODO: Re-enable this tests after UTC fixes UAX 21
\r
349 if(Normalizer.compare(s1.toCharArray(),s2.toCharArray(),Normalizer.COMPARE_IGNORE_CASE)!=0){
\r
350 errln("Normalizer.compare(char[],char[]) failed for s1: "
\r
351 +Utility.hex(s1) + " s2: " + Utility.hex(s2));
\r
355 private void cross(String s1, String s2,Normalizer.Mode mode){
\r
356 String result = Normalizer.normalize(s1,mode);
\r
357 if(!result.equals(s2)){
\r
358 errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
\r
363 * Do a normalization using the iterative API in the given direction.
\r
364 * @param buf scratch buffer
\r
365 * @param dir either +1 or -1
\r
367 private String iterativeNorm(String str, Normalizer.Mode mode,
\r
368 StringBuffer buf, int dir ,int options) throws Exception{
\r
369 normalizer.setText(str);
\r
370 normalizer.setMode(mode);
\r
372 normalizer.setOption(-1, false); // reset all options
\r
373 normalizer.setOption(options, true); // set desired options
\r
377 for (ch = normalizer.first(); ch != Normalizer.DONE;
\r
378 ch = normalizer.next()) {
\r
379 buf.append(UTF16.valueOf(ch));
\r
382 for (ch = normalizer.last(); ch != Normalizer.DONE;
\r
383 ch = normalizer.previous()) {
\r
384 buf.insert(0, UTF16.valueOf(ch));
\r
387 return buf.toString();
\r
391 * Do a normalization using the iterative API in the given direction.
\r
392 * @param str a Java StringCharacterIterator
\r
393 * @param buf scratch buffer
\r
394 * @param dir either +1 or -1
\r
396 private String iterativeNorm(StringCharacterIterator str, Normalizer.Mode mode,
\r
397 StringBuffer buf, int dir,int options) throws Exception{
\r
398 normalizer.setText(str);
\r
399 normalizer.setMode(mode);
\r
401 normalizer.setOption(-1, false); // reset all options
\r
402 normalizer.setOption(options, true); // set desired options
\r
406 for (ch = normalizer.first(); ch != Normalizer.DONE;
\r
407 ch = normalizer.next()) {
\r
408 buf.append(UTF16.valueOf(ch));
\r
411 for (ch = normalizer.last(); ch != Normalizer.DONE;
\r
412 ch = normalizer.previous()) {
\r
413 buf.insert(0, UTF16.valueOf(ch));
\r
416 return buf.toString();
\r
420 * @param op name of normalization form, e.g., "KC"
\r
421 * @param s string being normalized
\r
422 * @param got value received
\r
423 * @param exp expected value
\r
424 * @param msg description of this test
\r
425 * @returns true if got == exp
\r
427 private boolean assertEqual(String op, String s, String got,
\r
428 String exp, String msg) {
\r
429 if (exp.equals(got)) {
\r
432 errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got) +
\r
433 ", exp. " + hex(exp)));
\r
438 * Split a string into pieces based on the given delimiter
\r
439 * character. Then, parse the resultant fields from hex into
\r
440 * characters. That is, "0040 0400;0C00;0899" -> new String[] {
\r
441 * "\u0040\u0400", "\u0C00", "\u0899" }. The output is assumed to
\r
442 * be of the proper length already, and exactly output.length
\r
443 * fields are parsed. If there are too few an exception is
\r
444 * thrown. If there are too many the extras are ignored.
\r
446 * @param buf scratch buffer
\r
448 private static void hexsplit(String s, char delimiter,
\r
449 String[] output, StringBuffer buf) {
\r
452 for (i=0; i<output.length; ++i) {
\r
453 int delim = s.indexOf(delimiter, pos);
\r
455 throw new IllegalArgumentException("Missing field in " + s);
\r
457 // Our field is from pos..delim-1.
\r
460 String toHex = s.substring(pos,delim);
\r
463 int len = toHex.length();
\r
465 if(toHex.charAt(index)==' '){
\r
468 int spacePos = toHex.indexOf(' ', index);
\r
470 appendInt(buf,toHex.substring(index,len),s);
\r
473 appendInt(buf,toHex.substring(index, spacePos),s);
\r
475 index = spacePos+1;
\r
479 if (buf.length() < 1) {
\r
480 throw new IllegalArgumentException("Empty field " + i + " in " + s);
\r
482 output[i] = buf.toString();
\r
483 ++pos; // Skip over delim
\r
486 public static void appendInt(StringBuffer buf, String strToHex, String s){
\r
487 int hex = Integer.parseInt(strToHex,16);
\r
489 throw new IllegalArgumentException("Out of range hex " +
\r
491 }else if (hex > 0xFFFF){
\r
492 buf.append((char)((hex>>10)+0xd7c0));
\r
493 buf.append((char)((hex&0x3ff)|0xdc00));
\r
495 buf.append((char) hex);
\r
499 // Specific tests for debugging. These are generally failures
\r
500 // taken from the conformance file, but culled out to make
\r
501 // debugging easier. These can be eliminated without affecting
\r
504 public void _hideTestCase6(int options) throws Exception{
\r
505 _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;",options);
\r
508 public void _testOneLine(String line,int options) throws Exception{
\r
509 String[] fields = new String[5];
\r
510 StringBuffer buf = new StringBuffer();
\r
511 // Parse out the fields
\r
512 hexsplit(line, ';', fields, buf);
\r
513 checkConformance(fields, line,options);
\r