2 *******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 package com.ibm.icu.dev.test.normalizer;
10 import java.text.StringCharacterIterator;
11 import java.util.Random;
13 import com.ibm.icu.dev.test.TestFmwk;
14 import com.ibm.icu.impl.Norm2AllModes;
15 import com.ibm.icu.impl.Normalizer2Impl;
16 import com.ibm.icu.impl.USerializedSet;
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.lang.UCharacter;
19 import com.ibm.icu.lang.UCharacterCategory;
20 import com.ibm.icu.lang.UProperty;
21 import com.ibm.icu.text.FilteredNormalizer2;
22 import com.ibm.icu.text.Normalizer;
23 import com.ibm.icu.text.Normalizer2;
24 import com.ibm.icu.text.UCharacterIterator;
25 import com.ibm.icu.text.UTF16;
26 import com.ibm.icu.text.UnicodeSet;
27 import com.ibm.icu.text.UnicodeSetIterator;
30 public class BasicTest extends TestFmwk {
31 public static void main(String[] args) throws Exception {
32 new BasicTest().run(args);
35 String[][] canonTests = {
36 // Input Decomposed Composed
37 { "cat", "cat", "cat" },
38 { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", },
40 { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above
41 { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above
43 { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above
44 { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below
45 { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above
47 { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
48 { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
50 { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave
51 { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave
52 { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron
54 { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign
55 { "\u00c5", "A\u030a", "\u00c5" }, // A-ring
57 { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" },
58 { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" },
60 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0
61 { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0
63 { "Henry IV", "Henry IV", "Henry IV" },
64 { "Henry \u2163", "Henry \u2163", "Henry \u2163" },
66 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
67 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
68 { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten
69 { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten
70 { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten
72 { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
73 {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},
76 String[][] compatTests = {
77 // Input Decomposed Composed
78 { "cat", "cat", "cat" },
79 { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. Alef, Lamed
81 { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" },
82 { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i
84 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0
85 { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i
87 { "Henry IV", "Henry IV", "Henry IV" },
88 { "Henry \u2163", "Henry IV", "Henry IV" },
90 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
91 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
93 { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten
95 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
96 { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten
97 { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten
101 // With Canonical decomposition, Hangul syllables should get decomposed
102 // into Jamo, but Jamo characters should not be decomposed into
104 String[][] hangulCanon = {
105 // Input Decomposed Composed
106 { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" },
107 { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" },
110 // With compatibility decomposition turned on,
111 // it should go all the way down to conjoining Jamo characters.
112 // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE
113 String[][] hangulCompat = {
114 // Input Decomposed Composed
115 // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" },
118 public void TestHangulCompose()
120 // Make sure that the static composition methods work
121 logln("Canonical composition...");
122 staticTest(Normalizer.NFC, hangulCanon, 2);
123 logln("Compatibility composition...");
124 staticTest(Normalizer.NFKC, hangulCompat, 2);
125 // Now try iterative composition....
126 logln("Iterative composition...");
127 Normalizer norm = new Normalizer("", Normalizer.NFC,0);
128 iterateTest(norm, hangulCanon, 2);
130 norm.setMode(Normalizer.NFKD);
131 iterateTest(norm, hangulCompat, 2);
133 // And finally, make sure you can do it in reverse too
134 logln("Reverse iteration...");
135 norm.setMode(Normalizer.NFC);
136 backAndForth(norm, hangulCanon);
139 public void TestHangulDecomp() throws Exception{
140 // Make sure that the static decomposition methods work
141 logln("Canonical decomposition...");
142 staticTest(Normalizer.NFD, hangulCanon, 1);
143 logln("Compatibility decomposition...");
144 staticTest(Normalizer.NFKD, hangulCompat, 1);
146 // Now the iterative decomposition methods...
147 logln("Iterative decomposition...");
148 Normalizer norm = new Normalizer("", Normalizer.NFD,0);
149 iterateTest(norm, hangulCanon, 1);
151 norm.setMode(Normalizer.NFKD);
152 iterateTest(norm, hangulCompat, 1);
154 // And finally, make sure you can do it in reverse too
155 logln("Reverse iteration...");
156 norm.setMode(Normalizer.NFD);
157 backAndForth(norm, hangulCanon);
159 public void TestNone() throws Exception{
160 Normalizer norm = new Normalizer("", Normalizer.NONE,0);
161 iterateTest(norm, canonTests, 0);
162 staticTest(Normalizer.NONE, canonTests, 0);
164 public void TestDecomp() throws Exception{
165 Normalizer norm = new Normalizer("", Normalizer.NFD,0);
166 iterateTest(norm, canonTests, 1);
167 staticTest(Normalizer.NFD, canonTests, 1);
168 decomposeTest(Normalizer.NFD, canonTests, 1);
171 public void TestCompatDecomp() throws Exception{
172 Normalizer norm = new Normalizer("", Normalizer.NFKD,0);
173 iterateTest(norm, compatTests, 1);
174 staticTest(Normalizer.NFKD,compatTests, 1);
175 decomposeTest(Normalizer.NFKD,compatTests, 1);
178 public void TestCanonCompose() throws Exception{
179 Normalizer norm = new Normalizer("", Normalizer.NFC,0);
180 iterateTest(norm, canonTests, 2);
181 staticTest(Normalizer.NFC, canonTests, 2);
182 composeTest(Normalizer.NFC, canonTests, 2);
185 public void TestCompatCompose() throws Exception{
186 Normalizer norm = new Normalizer("", Normalizer.NFKC,0);
187 iterateTest(norm, compatTests, 2);
188 staticTest(Normalizer.NFKC,compatTests, 2);
189 composeTest(Normalizer.NFKC,compatTests, 2);
192 public void TestExplodingBase() throws Exception{
193 // \u017f - Latin small letter long s
194 // \u0307 - combining dot above
195 // \u1e61 - Latin small letter s with dot above
196 // \u1e9b - Latin small letter long s with dot above
198 // Input Decomposed Composed
199 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" },
200 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" },
202 String[][] compat = {
203 // Input Decomposed Composed
204 { "\u017f", "s", "s" },
205 { "\u1e9b", "s\u0307", "\u1e61" },
208 staticTest(Normalizer.NFD, canon, 1);
209 staticTest(Normalizer.NFC, canon, 2);
211 staticTest(Normalizer.NFKD, compat, 1);
212 staticTest(Normalizer.NFKC, compat, 2);
217 * The Tibetan vowel sign AA, 0f71, was messed up prior to
218 * Unicode version 2.1.9.
219 * Once 2.1.9 or 3.0 is released, uncomment this test.
221 public void TestTibetan() throws Exception{
222 String[][] decomp = {
223 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
225 String[][] compose = {
226 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
229 staticTest(Normalizer.NFD, decomp, 1);
230 staticTest(Normalizer.NFKD,decomp, 2);
231 staticTest(Normalizer.NFC, compose, 1);
232 staticTest(Normalizer.NFKC,compose, 2);
// NOTE(review): the javadoc opener "/**" for this method, the declaration of
// the EXCLUDED string constant, and the "if (b.equals(c))"-style guard before
// the errln branch are all missing from this extract (original lines 235,
// 245-246, 265 and others were dropped). Code below is left byte-identical.
236 * Make sure characters in the CompositionExclusion.txt list do not get
239 public void TestCompositionExclusion()
241 // This list is generated from CompositionExclusion.txt.
242 // Update whenever the normalizer tables are updated. Note
243 // that we test all characters listed, even those that can be
244 // derived from the Unicode DB and are therefore commented
247 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +
248 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +
249 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +
250 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +
251 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +
252 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +
253 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" +
254 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +
255 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +
256 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +
257 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +
258 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +
259 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +
260 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E";
// Round-trip each excluded character through NFKD then NFC; an excluded
// character must NOT recompose to itself.
261 for (int i=0; i<EXCLUDED.length(); ++i) {
262 String a = String.valueOf(EXCLUDED.charAt(i));
263 String b = Normalizer.normalize(a, Normalizer.NFKD);
264 String c = Normalizer.normalize(b, Normalizer.NFC);
266 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
267 hex(b) + " x COMPOSE => " +
269 } else if (isVerbose()) {
270 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
271 hex(b) + " x COMPOSE => " +
275 // The following method works too, but it is somewhat
276 // incestuous. It uses UInfo, which is the same database that
277 // NormalizerBuilder uses, so if something is wrong with
278 // UInfo, the following test won't show it. All it will show
279 // is that NormalizerBuilder has been run with whatever the
282 // We comment this out in favor of the test above, which
283 // provides independent verification (but also requires
284 // independent updating).
286 // UInfo uinfo = new UInfo();
287 // for (int i=0; i<=0xFFFF; ++i) {
288 // if (!uinfo.isExcludedComposition((char)i) ||
289 // (!uinfo.hasCanonicalDecomposition((char)i) &&
290 // !uinfo.hasCompatibilityDecomposition((char)i))) continue;
291 // String a = String.valueOf((char)i);
292 // String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0);
293 // String c = Normalizer.normalize(b,Normalizer.COMPOSE,0);
294 // if (c.equals(a)) {
295 // errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
296 // hex(b) + " x COMPOSE => " +
298 // } else if (isVerbose()) {
299 // logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
300 // hex(b) + " x COMPOSE => " +
// NOTE(review): the javadoc opener, the String[] DATA declaration, and the
// "String a = DATA[i];" line inside the loop are missing from this extract
// (original lines 306, 316-317, 328 were dropped). Code left byte-identical.
307 * Test for a problem that showed up just before ICU 1.6 release
308 * having to do with combining characters with an index of zero.
309 * Such characters do not participate in any canonical
310 * decompositions. However, having an index of zero means that
311 * they all share one typeMask[] entry, that is, they all have to
312 * map to the same canonical class, which is not the case, in
315 public void TestZeroIndex()
318 // Expect col1 x COMPOSE_COMPAT => col2
319 // Expect col2 x DECOMP => col3
320 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
321 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
322 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
323 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
324 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
// Each triple: NFKC(col1) must equal col2, then NFD(col2) must equal col3.
327 for (int i=0; i<DATA.length; i+=3) {
329 String b = Normalizer.normalize(a, Normalizer.NFKC);
330 String exp = DATA[i+1];
332 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));
334 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +
335 ", expect " + hex(exp));
337 a = Normalizer.normalize(b, Normalizer.NFD);
340 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a));
342 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) +
343 ", expect " + hex(exp));
349 * Test for a problem found by Verisign. Problem is that
350 * characters at the start of a string are not put in canonical
351 * order correctly by compose() if there is no starter.
353 public void TestVerisign()
356 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
357 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
360 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
361 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
364 for (int i = 0; i < inputs.length; ++i) {
365 String input = inputs[i];
366 String output = outputs[i];
367 String result = Normalizer.decompose(input, false);
368 if (!result.equals(output)) {
369 errln("FAIL input: " + hex(input));
370 errln(" decompose: " + hex(result));
371 errln(" expected: " + hex(output));
373 result = Normalizer.compose(input, false);
374 if (!result.equals(output)) {
375 errln("FAIL input: " + hex(input));
376 errln(" compose: " + hex(result));
377 errln(" expected: " + hex(output));
// Quick check on characters known to be NOT normalized in each form must
// return Normalizer.NO.
// NOTE(review): the declarations of SIZE and count (original lines ~392-396)
// and several closing braces are missing from this extract.
382 public void TestQuickCheckResultNO()
384 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C,
385 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E};
386 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB,
387 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E};
388 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE,
389 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
390 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE,
391 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
397 for (; count < SIZE; count ++)
399 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
400 Normalizer.NFD,0) != Normalizer.NO)
402 errln("ERROR in NFD quick check at U+" +
403 Integer.toHexString(CPNFD[count]));
406 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
407 Normalizer.NFC,0) !=Normalizer.NO)
409 errln("ERROR in NFC quick check at U+"+
410 Integer.toHexString(CPNFC[count]));
413 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
414 Normalizer.NFKD,0) != Normalizer.NO)
416 errln("ERROR in NFKD quick check at U+"+
417 Integer.toHexString(CPNFKD[count]));
420 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
421 Normalizer.NFKC,0) !=Normalizer.NO)
423 errln("ERROR in NFKC quick check at U+"+
424 Integer.toHexString(CPNFKC[count]));
427 // for improving coverage
// Exercises the two-argument quickCheck overload (no options parameter).
428 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
429 Normalizer.NFKC) !=Normalizer.NO)
431 errln("ERROR in NFKC quick check at U+"+
432 Integer.toHexString(CPNFKC[count]));
// Quick check on characters known to be normalized in each form must return
// Normalizer.YES. A first (partially missing) loop sweeps a range of single
// code points; a second loop checks the curated arrays below.
// NOTE(review): the SIZE/count declarations and the opening of the "cp" sweep
// loop (original lines ~449-455) are missing from this extract.
439 public void TestQuickCheckResultYES()
441 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A,
442 0x2261, 0x3075, 0x4000, 0x5000, 0xF000};
443 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500,
444 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000};
445 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB,
446 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27};
447 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000,
448 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E};
456 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0)
459 errln("ERROR in NFD quick check at U+"+
460 Integer.toHexString(cp));
463 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0)
466 errln("ERROR in NFC quick check at U+"+
467 Integer.toHexString(cp));
470 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0)
473 errln("ERROR in NFKD quick check at U+" +
474 Integer.toHexString(cp));
477 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0)
480 errln("ERROR in NFKC quick check at U+"+
481 Integer.toHexString(cp));
484 // improve the coverage
485 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC)
488 errln("ERROR in NFKC quick check at U+"+
489 Integer.toHexString(cp));
495 for (; count < SIZE; count ++)
497 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
498 Normalizer.NFD,0)!=Normalizer.YES)
500 errln("ERROR in NFD quick check at U+"+
501 Integer.toHexString(CPNFD[count]));
504 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
505 Normalizer.NFC,0)!=Normalizer.YES)
507 errln("ERROR in NFC quick check at U+"+
508 Integer.toHexString(CPNFC[count]));
511 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
512 Normalizer.NFKD,0)!=Normalizer.YES)
514 errln("ERROR in NFKD quick check at U+"+
515 Integer.toHexString(CPNFKD[count]));
518 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
519 Normalizer.NFKC,0)!=Normalizer.YES)
521 errln("ERROR in NFKC quick check at U+"+
522 Integer.toHexString(CPNFKC[count]));
525 // improve the coverage
526 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
527 Normalizer.NFKC)!=Normalizer.YES)
529 errln("ERROR in NFKC quick check at U+"+
530 Integer.toHexString(CPNFKC[count]));
535 public void TestBengali() throws Exception{
536 String input = "\u09bc\u09be\u09cd\u09be";
537 String output=Normalizer.normalize(input,Normalizer.NFC);
538 if(!input.equals(output)){
539 errln("ERROR in NFC of string");
// Quick check on characters that MAY compose with a preceding character must
// return Normalizer.MAYBE for NFC/NFKC (NFD/NFKD have no MAYBE values).
// NOTE(review): the SIZE/count declarations (original lines ~549-554) are
// missing from this extract.
542 public void TestQuickCheckResultMAYBE()
545 final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161,
546 0x116A, 0x1173, 0x1175, 0x3099, 0x309A};
547 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E,
548 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099};
555 /* NFD and NFKD does not have any MAYBE codepoints */
556 for (; count < SIZE; count ++)
558 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
559 Normalizer.NFC,0)!=Normalizer.MAYBE)
561 errln("ERROR in NFC quick check at U+"+
562 Integer.toHexString(CPNFC[count]));
565 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
566 Normalizer.NFKC,0)!=Normalizer.MAYBE)
568 errln("ERROR in NFKC quick check at U+"+
569 Integer.toHexString(CPNFKC[count]));
// Same checks through the char[] overload of quickCheck.
572 if (Normalizer.quickCheck(new char[]{CPNFC[count]},
573 Normalizer.NFC,0)!=Normalizer.MAYBE)
575 errln("ERROR in NFC quick check at U+"+
576 Integer.toHexString(CPNFC[count]));
579 if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
580 Normalizer.NFKC,0)!=Normalizer.MAYBE)
582 errln("ERROR in NFKC quick check at U+"+
583 Integer.toHexString(CPNFKC[count]));
// Mode NONE must always answer YES.
586 if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
587 Normalizer.NONE,0)!=Normalizer.YES)
589 errln("ERROR in NONE quick check at U+"+
590 Integer.toHexString(CPNFKC[count]));
// Quick check applied to whole normalized strings from the shared test
// tables: the decomposed column must be YES under NFD/NFKD and the composed
// column must not be NO under NFC/NFKC.
// NOTE(review): the declarations of count, d and c (original lines ~597-601)
// and the comparison operands/closing braces are missing from this extract.
596 public void TestQuickCheckStringResult()
602 for (count = 0; count < canonTests.length; count ++)
604 d = canonTests[count][1];
605 c = canonTests[count][2];
606 if (Normalizer.quickCheck(d,Normalizer.NFD,0)
609 errln("ERROR in NFD quick check for string at count " + count);
613 if (Normalizer.quickCheck(c, Normalizer.NFC,0)
616 errln("ERROR in NFC quick check for string at count " + count);
621 for (count = 0; count < compatTests.length; count ++)
623 d = compatTests[count][1];
624 c = compatTests[count][2];
625 if (Normalizer.quickCheck(d, Normalizer.NFKD,0)
628 errln("ERROR in NFKD quick check for string at count " + count);
632 if (Normalizer.quickCheck(c, Normalizer.NFKC,0)
635 errln("ERROR in NFKC quick check for string at count " + count);
641 static final int qcToInt(Normalizer.QuickCheckResult qc) {
642 if(qc==Normalizer.NO) {
644 } else if(qc==Normalizer.YES) {
646 } else /* Normalizer.MAYBE */ {
// Verifies, per code point, that the UProperty NF*_QUICK_CHECK property
// values agree with Normalizer.quickCheck() on a single-code-point string,
// and that the lead/trail canonical combining class properties agree with
// the combining classes of the first/last code points of the NFD form.
// NOTE(review): the declarations of c, s, qc1, qc2, nfd, lead, trail, the
// sweep-loop header, and the return/increment logic (original lines ~652-657,
// 671-675, 714-719) are missing from this extract.
651 public void TestQuickCheckPerCP() {
654 int lccc1, lccc2, tccc1, tccc2;
// Sanity-check the property value ranges first (YES=1 max for NFD/NFKD,
// MAYBE=2 max for NFC/NFKC).
658 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES
659 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 ||
660 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE
661 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 ||
662 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) ||
663 UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)
665 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS");
669 * compare the quick check property values for some code points
670 * to the quick check results for checking same-code point strings
676 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK);
677 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC));
679 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c));
682 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK);
683 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD));
685 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c));
688 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK);
689 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC));
691 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c));
694 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK);
695 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD));
697 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c));
700 nfd=Normalizer.normalize(s, Normalizer.NFD);
701 lead=UTF16.charAt(nfd, 0);
702 trail=UTF16.charAt(nfd, nfd.length()-1);
704 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS);
705 lccc2=UCharacter.getCombiningClass(lead);
706 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
707 tccc2=UCharacter.getCombiningClass(trail);
710 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c));
713 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c));
716 /* skip some code points */
721 //------------------------------------------------------------------------
722 // Internal utilities
724 //------------------------------------------------------------------------
725 // Internal utilities
728 /* private void backAndForth(Normalizer iter, String input)
732 // Run through the iterator forwards and stick it into a StringBuffer
733 StringBuffer forward = new StringBuffer();
734 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
738 // Now do it backwards
739 StringBuffer reverse = new StringBuffer();
740 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
741 reverse.insert(0, ch);
744 if (!forward.toString().equals(reverse.toString())) {
745 errln("FAIL: Forward/reverse mismatch for input " + hex(input)
746 + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
747 } else if (isVerbose()) {
748 logln("Ok: Forward/reverse for input " + hex(input)
749 + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
// Iterates each test input forwards and backwards through the Normalizer
// and checks that both directions yield the same text.
// NOTE(review): the forward loop's body (the append statement, original line
// ~763) and several closing braces are missing from this extract.
// NOTE(review): `ch` is an int, so reverse.insert(0, ch) inserts its DECIMAL
// string form, not the character; presumably the (missing) forward append
// does the same so the comparison stays consistent — verify against the
// complete source.
753 private void backAndForth(Normalizer iter, String[][] tests)
755 for (int i = 0; i < tests.length; i++)
757 iter.setText(tests[i][0]);
759 // Run through the iterator forwards and stick it into a
761 StringBuffer forward = new StringBuffer();
762 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
766 // Now do it backwards
767 StringBuffer reverse = new StringBuffer();
768 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
769 reverse.insert(0, ch);
772 if (!forward.toString().equals(reverse.toString())) {
773 errln("FAIL: Forward/reverse mismatch for input "
774 + hex(tests[i][0]) + ", forward: " + hex(forward)
775 + ", backward: " + hex(reverse));
776 } else if (isVerbose()) {
777 logln("Ok: Forward/reverse for input " + hex(tests[i][0])
778 + ", forward: " + hex(forward) + ", backward: "
// Runs Normalizer.normalize over each test row, first via the String API,
// then via the char[] API using a deliberately undersized destination buffer
// that is grown when IndexOutOfBoundsException reports the required length
// in its message.
// NOTE(review): the retry-loop scaffolding around the try block (the
// `int reqLength` declaration, `for(;;)`, `break`/`continue` and closing
// braces, original lines ~806-820) is missing from this extract.
784 private void staticTest (Normalizer.Mode mode,
785 String[][] tests, int outCol) throws Exception{
786 for (int i = 0; i < tests.length; i++)
788 String input = Utility.unescape(tests[i][0]);
789 String expect = Utility.unescape(tests[i][outCol]);
791 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
793 String output = Normalizer.normalize(input, mode);
795 if (!output.equals(expect)) {
796 errln("FAIL: case " + i
797 + " expected '" + expect + "' (" + hex(expect) + ")"
798 + " but got '" + output + "' (" + hex(output) + ")" );
801 char[] output = new char[1];
802 for (int i = 0; i < tests.length; i++)
804 char[] input = Utility.unescape(tests[i][0]).toCharArray();
805 String expect =Utility.unescape( tests[i][outCol]);
807 logln("Normalizing '" + new String(input) + "' (" +
808 hex(new String(input)) + ")" );
812 reqLength=Normalizer.normalize(input,output, mode,0);
813 if(reqLength<=output.length ){
816 }catch(IndexOutOfBoundsException e){
// The exception message carries the required destination buffer size.
817 output= new char[Integer.parseInt(e.getMessage())];
821 if (!expect.equals(new String(output,0,reqLength))) {
822 errln("FAIL: case " + i
823 + " expected '" + expect + "' (" + hex(expect) + ")"
824 + " but got '" + new String(output)
825 + "' (" + hex(new String(output)) + ")" );
829 private void decomposeTest(Normalizer.Mode mode,
830 String[][] tests, int outCol) throws Exception{
831 for (int i = 0; i < tests.length; i++)
833 String input = Utility.unescape(tests[i][0]);
834 String expect = Utility.unescape(tests[i][outCol]);
836 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
838 String output = Normalizer.decompose(input, mode==Normalizer.NFKD);
840 if (!output.equals(expect)) {
841 errln("FAIL: case " + i
842 + " expected '" + expect + "' (" + hex(expect) + ")"
843 + " but got '" + output + "' (" + hex(output) + ")" );
846 char[] output = new char[1];
847 for (int i = 0; i < tests.length; i++)
849 char[] input = Utility.unescape(tests[i][0]).toCharArray();
850 String expect = Utility.unescape(tests[i][outCol]);
852 logln("Normalizing '" + new String(input) + "' (" +
853 hex(new String(input)) + ")" );
857 reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0);
858 if(reqLength<=output.length ){
861 }catch(IndexOutOfBoundsException e){
862 output= new char[Integer.parseInt(e.getMessage())];
866 if (!expect.equals(new String(output,0,reqLength))) {
867 errln("FAIL: case " + i
868 + " expected '" + expect + "' (" + hex(expect) + ")"
869 + " but got '" + new String(output)
870 + "' (" + hex(new String(output)) + ")" );
873 output = new char[1];
874 for (int i = 0; i < tests.length; i++)
876 char[] input = Utility.unescape(tests[i][0]).toCharArray();
877 String expect = Utility.unescape(tests[i][outCol]);
879 logln("Normalizing '" + new String(input) + "' (" +
880 hex(new String(input)) + ")" );
884 reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0);
885 if(reqLength<=output.length ){
888 }catch(IndexOutOfBoundsException e){
889 output= new char[Integer.parseInt(e.getMessage())];
893 if (!expect.equals(new String(output,0,reqLength))) {
894 errln("FAIL: case " + i
895 + " expected '" + expect + "' (" + hex(expect) + ")"
896 + " but got '" + new String(output)
897 + "' (" + hex(new String(output)) + ")" );
899 char[] output2 = new char[reqLength * 2];
900 System.arraycopy(output, 0, output2, 0, reqLength);
901 int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
902 if(retLength != reqLength){
903 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
908 private void composeTest(Normalizer.Mode mode,
909 String[][] tests, int outCol) throws Exception{
910 for (int i = 0; i < tests.length; i++)
912 String input = Utility.unescape(tests[i][0]);
913 String expect = Utility.unescape(tests[i][outCol]);
915 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
917 String output = Normalizer.compose(input, mode==Normalizer.NFKC);
919 if (!output.equals(expect)) {
920 errln("FAIL: case " + i
921 + " expected '" + expect + "' (" + hex(expect) + ")"
922 + " but got '" + output + "' (" + hex(output) + ")" );
925 char[] output = new char[1];
926 for (int i = 0; i < tests.length; i++)
928 char[] input = Utility.unescape(tests[i][0]).toCharArray();
929 String expect = Utility.unescape(tests[i][outCol]);
931 logln("Normalizing '" + new String(input) + "' (" +
932 hex(new String(input)) + ")" );
936 reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0);
937 if(reqLength<=output.length ){
940 }catch(IndexOutOfBoundsException e){
941 output= new char[Integer.parseInt(e.getMessage())];
945 if (!expect.equals(new String(output,0,reqLength))) {
946 errln("FAIL: case " + i
947 + " expected '" + expect + "' (" + hex(expect) + ")"
948 + " but got '" + new String(output)
949 + "' (" + hex(new String(output)) + ")" );
952 output = new char[1];
953 for (int i = 0; i < tests.length; i++)
955 char[] input = Utility.unescape(tests[i][0]).toCharArray();
956 String expect = Utility.unescape(tests[i][outCol]);
958 logln("Normalizing '" + new String(input) + "' (" +
959 hex(new String(input)) + ")" );
963 reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0);
964 if(reqLength<=output.length ){
967 }catch(IndexOutOfBoundsException e){
968 output= new char[Integer.parseInt(e.getMessage())];
972 if (!expect.equals(new String(output,0,reqLength))) {
973 errln("FAIL: case " + i
974 + " expected '" + expect + "' (" + hex(expect) + ")"
975 + " but got '" + new String(output)
976 + "' (" + hex(new String(output)) + ")" );
979 char[] output2 = new char[reqLength * 2];
980 System.arraycopy(output, 0, output2, 0, reqLength);
981 int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
982 if(retLength != reqLength){
983 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
987 private void iterateTest(Normalizer iter, String[][] tests, int outCol){
988 for (int i = 0; i < tests.length; i++)
990 String input = Utility.unescape(tests[i][0]);
991 String expect = Utility.unescape(tests[i][outCol]);
993 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
996 assertEqual(expect, iter, "case " + i + " ");
// Verifies that iterating the Normalizer forwards matches `expected` code
// point by code point, then iterates backwards comparing against a
// UCharacterIterator over the same expected string.
// NOTE(review): the declarations of ch/index, the `if (ch != want)` guards,
// index bookkeeping in the backward loop, and closing braces (original lines
// ~1001-1005, 1014, 1025-1030, 1035-1036) are missing from this extract.
1000 private void assertEqual(String expected, Normalizer iter, String msg)
1004 UCharacterIterator cIter = UCharacterIterator.getInstance(expected);
1006 while ((ch=iter.next())!= Normalizer.DONE){
1007 if (index >= expected.length()) {
1008 errln("FAIL: " + msg + "Unexpected character '" + (char)ch
1009 + "' (" + hex(ch) + ")"
1010 + " at index " + index);
1013 int want = UTF16.charAt(expected,index);
1015 errln("FAIL: " + msg + "got '" + (char)ch
1016 + "' (" + hex(ch) + ")"
1017 + " but expected '" + want + "' (" + hex(want)+ ")"
1018 + " at index " + index);
1020 index+= UTF16.getCharCount(ch);
1022 if (index < expected.length()) {
1023 errln("FAIL: " + msg + "Only got " + index + " chars, expected "
1024 + expected.length());
// Backward pass: compare against the expected string iterated in reverse.
1028 while((ch=iter.previous())!=Normalizer.DONE){
1029 int want = cIter.previousCodePoint();
1031 errln("FAIL: " + msg + "got '" + (char)ch
1032 + "' (" + hex(ch) + ")"
1033 + " but expected '" + want + "' (" + hex(want) + ")"
1034 + " at index " + index);
1038 //--------------------------------------------------------------------------
1040 // NOTE: These tests are used for quick debugging so are not ported
1041 // to ICU4C tsnorm.cpp in intltest
     // Sanity-checks static Normalizer entry points: isNormalized() on a
     // short NFC string, then normalize() on a long mixed input (Hangul
     // syllables, supplementary musical symbols needing reordering, and
     // ASCII filler) against a hand-computed expected NFC result.
1044 public void TestDebugStatic(){
1045     String in = Utility.unescape("\\U0001D157\\U0001D165");
1046     if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){
1047         errln("isNormalized failed");
     // 'input' mixes literal \uXXXX chars with "\\Uxxxxxxxx" escape text;
     // the escapes are resolved by Utility.unescape() at the call below.
1050     String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+
1051         "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1052         "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1053         "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1054         "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1055         "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1056         "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1057         "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1058         "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
1059         "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1060         "\uAD8B\uAD8B\uAD8B\uAD8B"+
1061         "d\u031B\u0307\u0323";
     // Expected output is written entirely with literal UTF-16 units
     // (surrogate pairs spelled out), so no unescaping is needed.
1062     String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
1063         "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+
1064         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1065         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1066         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1067         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1068         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1069         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1070         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1071         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1072         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1073         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1074         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1075         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1076         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1077         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1078         "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+
1079         "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1080         "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1081         "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+
1082         "cccccccccccccccccccccccccccccccccccccccccccccccc"+
1083         "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1084         "dddddddddddddddddddddddd"+
1085         "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
1086         "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307";
     // NOTE(review): the normalize() mode argument is on an elided line
     // (original 1088); presumably NFD given the decomposed expectation —
     // confirm against the full source.
1087     String output = Normalizer.normalize(Utility.unescape(input),
1089     if(!expect.equals(output)){
1090         errln("FAIL expected: "+hex(expect) + " got: "+hex(output));
// Debug helper: iterates a Normalizer over a short supplementary-character
// string and checks forward output (by UTF-16 index) and backward output
// (against a reverse code-point iterator). Same comparison shape as
// assertEqual(), inlined here for quick debugging.
// NOTE(review): excerpt elides lines (the Normalizer mode at original
// line 1100 and the ch/index declarations are not visible).
1096 public void TestDebugIter(){
1097     String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
1098     String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
1099     Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)),
1103     UCharacterIterator cIter = UCharacterIterator.getInstance(expected);
1105     while ((ch=iter.next())!= Normalizer.DONE){
1106         if (index >= expected.length()) {
1107             errln("FAIL: " + "Unexpected character '" + (char)ch
1108                 + "' (" + hex(ch) + ")"
1109                 + " at index " + index);
             // Full code-point comparison so surrogate pairs match correctly.
1112         int want = UTF16.charAt(expected,index);
1114             errln("FAIL: " + "got '" + (char)ch
1115                 + "' (" + hex(ch) + ")"
1116                 + " but expected '" + want + "' (" + hex(want)+ ")"
1117                 + " at index " + index);
1119         index+= UTF16.getCharCount(ch);
1121     if (index < expected.length()) {
1122         errln("FAIL: " + "Only got " + index + " chars, expected "
1123             + expected.length());
         // Backward pass mirrors previousCodePoint() on the expected string.
1127     while((ch=iter.previous())!=Normalizer.DONE){
1128         int want = cIter.previousCodePoint();
1130             errln("FAIL: " + "got '" + (char)ch
1131                 + "' (" + hex(ch) + ")"
1132                 + " but expected '" + want + "' (" + hex(want) + ")"
1133                 + " at index " + index);
// Older debug iteration test over a single supplementary character
// (U+1D15E, which decomposes to U+1D157 U+1D165): collects the forward
// iteration into a StringBuffer and compares with 'expected', then
// iterates backward and compares with 'expectedReverse'.
1137 public void TestDebugIterOld(){
1138     String input = "\\U0001D15E";
         // expected = NFD of U+1D15E as explicit surrogate pairs;
         // expectedReverse = the same two code points in reverse order.
1139     String expected = "\uD834\uDD57\uD834\uDD65";
1140     String expectedReverse = "\uD834\uDD65\uD834\uDD57";
1143     Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)),
1145     StringBuffer got = new StringBuffer();
1146     for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next())
1148         if (index >= expected.length()) {
1149             errln("FAIL: " + "Unexpected character '" + (char)ch +
1150                 "' (" + hex(ch) + ")" + " at index " + index);
1153         got.append(UCharacter.toString(ch));
1156     if (!expected.equals(got.toString())) {
1157         errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")"
1158             + " but expected '" + expected + "' ("
1159             + hex(expected) + ")");
1161     if (got.length() < expected.length()) {
1162         errln("FAIL: " + "Only got " + index + " chars, expected "
1163             + expected.length());
         // Reverse pass: start from the end and walk back with previous().
1166     logln("Reverse Iteration\n");
1167     iter.setIndexOnly(iter.endIndex());
1169     for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){
1170         if (index >= expected.length()) {
1171             errln("FAIL: " + "Unexpected character '" + (char)ch
1172                 + "' (" + hex(ch) + ")" + " at index " + index);
1175         got.append(UCharacter.toString(ch));
         // NOTE(review): the comparison uses expectedReverse but the error
         // message below prints 'expected' — the reported "expected" value
         // is misleading on failure (message-only inconsistency).
1177     if (!expectedReverse.equals(got.toString())) {
1178         errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")"
1179             + " but expected '" + expected
1180             + "' (" + hex(expected) + ")");
1182     if (got.length() < expected.length()) {
1183         errln("FAIL: " + "Only got " + index + " chars, expected "
1184             + expected.length());
1188 //--------------------------------------------------------------------------
1189 // helper class for TestPreviousNext()
1190 // simple UTF-32 character iterator
     // Wraps an int[] of code points with current()/previous()/getIndex()
     // accessors, used as the reference iterator in the previous/next tests.
     // NOTE(review): constructor and method bodies are elided in this
     // excerpt (original lines 1194-1228).
1191 class UCharIterator {
1193     public UCharIterator(int[] src, int len, int index){
1200     public int current() {
1216     public int previous() {
1224     public int getIndex() {
         // length = number of valid entries in the backing array;
         // i = current iteration position.
1229     private int length, i;
// Exercises mixed previous()/current()/next() movement on a Normalizer
// built from a String, comparing every returned code point and every
// index against a reference UTF-32 iterator plus an expectIndex[] map
// from expect-positions back to src UTF-16 positions.
// NOTE(review): the src/expect array literals and several statements are
// elided in this excerpt.
1231 public void TestPreviousNext() {
1232     // src and expect strings
1234     UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
1235     UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
1246     // expected src indexes corresponding to expect indexes
1252     6 // behind last character
1255     // initial indexes into the src and expect strings
1257     final int SRC_MIDDLE=4;
1258     final int EXPECT_MIDDLE=3;
1262     // - for previous(), 0 for current(), + for next()
1263     // not const so that we can terminate it below for the error message
1264     String moves="0+0+0--0-0-+++0--+++++++0--------";
1267     Normalizer iter = new Normalizer(new String(src),
1269     UCharIterator iter32 = new UCharIterator(expect, expect.length,
1275     // initially set the indexes into the middle of the strings
1276     iter.setIndexOnly(SRC_MIDDLE);
1278     // move around and compare the iteration code points with
1279     // the expected ones
1281     while(movesIndex<moves.length()) {
1282         m=moves.charAt(movesIndex++);
1285             c2=iter32.previous();
1288             c2=iter32.current();
1289         } else /* m=='+' */ {
1296             // copy the moves until the current (m) move, and terminate
1297             String history = moves.substring(0,movesIndex);
1298             errln("error: mismatch in Normalizer iteration at "+history+": "
1299                 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
         // After each move the Normalizer's UTF-16 index must map to the
         // reference iterator's position via expectIndex[].
1304         if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
1305             // copy the moves until the current (m) move, and terminate
1306             String history = moves.substring(0,movesIndex);
1307             errln("error: index mismatch in Normalizer iteration at "
1308                 +history+ " : "+ "Normalizer index " +iter.getIndex()
1309                 +" expected "+ expectIndex[iter32.getIndex()]);
// Same movement test as TestPreviousNext(), but the Normalizer is
// constructed from a java.text.StringCharacterIterator (JCI) instead of
// a String, verifying the CharacterIterator-backed code path.
// NOTE(review): array literals and several statements are elided in this
// excerpt.
1315 public void TestPreviousNextJCI() {
1316     // src and expect strings
1318     UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
1319     UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
1330     // expected src indexes corresponding to expect indexes
1336     6 // behind last character
1339     // initial indexes into the src and expect strings
1341     final int SRC_MIDDLE=4;
1342     final int EXPECT_MIDDLE=3;
1346     // - for previous(), 0 for current(), + for next()
1347     // not const so that we can terminate it below for the error message
1348     String moves="0+0+0--0-0-+++0--+++++++0--------";
1351     StringCharacterIterator text = new StringCharacterIterator(new String(src));
1352     Normalizer iter = new Normalizer(text,Normalizer.NFD,0);
1353     UCharIterator iter32 = new UCharIterator(expect, expect.length,
1359     // initially set the indexes into the middle of the strings
1360     iter.setIndexOnly(SRC_MIDDLE);
1362     // move around and compare the iteration code points with
1363     // the expected ones
1365     while(movesIndex<moves.length()) {
1366         m=moves.charAt(movesIndex++);
1369             c2=iter32.previous();
1372             c2=iter32.current();
1373         } else /* m=='+' */ {
1380             // copy the moves until the current (m) move, and terminate
1381             String history = moves.substring(0,movesIndex);
1382             errln("error: mismatch in Normalizer iteration at "+history+": "
1383                 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
1388         if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
1389             // copy the moves until the current (m) move, and terminate
1390             String history = moves.substring(0,movesIndex);
1391             errln("error: index mismatch in Normalizer iteration at "
1392                 +history+ " : "+ "Normalizer index " +iter.getIndex()
1393                 +" expected "+ expectIndex[iter32.getIndex()]);
1399 // test APIs that are not otherwise used - improve test coverage
     // Broad coverage test for Normalizer instance APIs: construction from a
     // CharacterIterator, clone()/equals()/hashCode(), compose()/decompose(),
     // setText()/getText(), setMode()/getMode(), last()/previous() buffer
     // overflow, NONE mode pass-through, and normalize(int, Mode[, int]).
     // NOTE(review): many lines are elided in this excerpt; several closing
     // braces and intermediate statements are not visible.
1400 public void TestNormalizerAPI() throws Exception {
1402     // instantiate a Normalizer from a CharacterIterator
1403     String s=Utility.unescape("a\u0308\uac00\\U0002f800");
1404     // make s a bit longer and more interesting
1405     UCharacterIterator iter = UCharacterIterator.getInstance(s+s);
1406     Normalizer norm = new Normalizer(iter, Normalizer.NFC,0);
         // a + combining diaeresis composes to U+00E4 under NFC.
1407     if(norm.next()!=0xe4) {
1408         errln("error in Normalizer(CharacterIterator).next()");
1411     // test clone(), ==, and hashCode()
1412     Normalizer clone=(Normalizer)norm.clone();
         // NOTE(review): this flags an error when clone EQUALS norm, while
         // the message text reads "clone()!=norm" — confirm the intended
         // equality semantics against the full source.
1413     if(clone.equals(norm)) {
1414         errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm");
1418     if(clone.getLength()!= norm.getLength()){
1419         errln("error in Normalizer.getBeginIndex()");
1421     // clone must have the same hashCode()
1422     //if(clone.hashCode()!=norm.hashCode()) {
1423     //    errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()");
1425     if(clone.next()!=0xac00) {
1426         errln("error in Normalizer(Normalizer(CharacterIterator)).next()");
1428     int ch = clone.next();
1430         errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()");
1432     // position changed, must change hashCode()
1433     if(clone.hashCode()==norm.hashCode()) {
1434         errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()");
1437     // test compose() and decompose()
         // U+2121 (TEL sign) exercises NFKC/NFKD compatibility mapping.
1440     tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121");
1441     tel.insert(1,(char)0x0301);
1443     nfkc=Normalizer.compose(tel.toString(), true);
1444     nfkd=Normalizer.decompose(tel.toString(), true);
1446     !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))||
1447     !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL"))
1449         errln("error in Normalizer::(de)compose(): wrong result(s)");
1453     // ch=norm.setIndex(3);
1455     //     errln("error in Normalizer(CharacterIterator).setIndex(3)");
1458     // test setText(CharacterIterator) and getText()
1460     clone.setText(iter);
1462     out = clone.getText();
1463     out2 = iter.getText();
1464     if( !out.equals(out2) ||
1465         clone.startIndex()!=0||
1466         clone.endIndex()!=iter.getLength()
1468         errln("error in Normalizer::setText() or Normalizer::getText()");
         // getText(char[]) must agree with the iterator's contents.
1471     char[] fillIn1 = new char[clone.getLength()];
1472     char[] fillIn2 = new char[iter.getLength()];
1473     int len = clone.getText(fillIn1);
1474     iter.getText(fillIn2,0);
1475     if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
1476         errln("error in Normalizer.getText(). Normalizer: "+
1477             Utility.hex(new String(fillIn1))+
1478             " Iter: " + Utility.hex(new String(fillIn2)));
1481     clone.setText(fillIn1);
1482     len = clone.getText(fillIn2);
1483     if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
1484         errln("error in Normalizer.setText() or Normalizer.getText()"+
1485             Utility.hex(new String(fillIn1))+
1486             " Iter: " + Utility.hex(new String(fillIn2)));
1489     // test setText(UChar *), getUMode() and setMode()
1491     clone.setIndexOnly(1);
1492     clone.setMode(Normalizer.NFD);
1493     if(clone.getMode()!=Normalizer.NFD) {
1494         errln("error in Normalizer::setMode() or Normalizer::getMode()");
         // Under NFD, "a<U+0308><U+AC00>..." yields U+0308 then the Hangul
         // leading consonant U+1100 from decomposing U+AC00.
1496     if(clone.next()!=0x308 || clone.next()!=0x1100) {
1497         errln("error in Normalizer::setText() or Normalizer::setMode()");
1500     // test last()/previous() with an internal buffer overflow
1501     StringBuffer buf = new StringBuffer("aaaaaaaaaa");
1502     buf.setCharAt(10-1,'\u0308');
1504     if(clone.last()!=0x308) {
1505         errln("error in Normalizer(10*U+0308).last()");
         // NONE mode: iteration returns the text unmodified.
1509     norm.setMode(Normalizer.NONE);
1510     if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) {
1511         errln("error in Normalizer(UNORM_NONE).first()/next()/last()");
1513     out=Normalizer.normalize(s, Normalizer.NONE);
1514     if(!out.equals(s)) {
1515         errln("error in Normalizer::normalize(UNORM_NONE)");
1518     String exp = "\\U0001D157\\U0001D165";
1519     String ns = Normalizer.normalize(ch,Normalizer.NFC);
1520     if(!ns.equals(Utility.unescape(exp))){
1521         errln("error in Normalizer.normalize(int,Mode)");
1523     ns = Normalizer.normalize(ch,Normalizer.NFC,0);
1524     if(!ns.equals(Utility.unescape(exp))){
1525         errln("error in Normalizer.normalize(int,Mode,int)");
1529 }catch(Exception e){
// Tests Normalizer.concatenate() for both the String and char[] overloads:
// each case row is {mode, left, right, expected}, and concatenating left
// and right under the given mode must produce the normalized expected
// string (concatenation across a normalization boundary must re-normalize
// the join point).
// NOTE(review): most of the cases[] rows and the unescape step for
// left/right/expect are elided in this excerpt.
1534 public void TestConcatenate() {
1536     Object[][]cases=new Object[][]{
1537         /* mode, left, right, result */
1550         /* ### TODO: add more interesting cases */
1554         "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169
1555         "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345
1559     String left, right, expect, result;
1560     Normalizer.Mode mode;
1563     /* test concatenation */
1564     for(i=0; i<cases.length; ++i) {
1565         mode = (Normalizer.Mode)cases[i][0];
1567         left=(String)cases[i][1];
1568         right=(String)cases[i][2];
1569         expect=(String)cases[i][3];
             // String overload.
1571         result=Normalizer.concatenate(left, right, mode,0);
1572         if(!result.equals(expect)) {
1573             errln("error in Normalizer.concatenate(), cases[] failed"
1574                 +", result==expect: expected: "
1575                 + hex(expect)+" =========> got: " + hex(result));
             // char[] overload must agree with the String overload.
1579         result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0);
1580         if(!result.equals(expect)) {
1581             errln("error in Normalizer.concatenate(), cases[] failed"
1582                 +", result==expect: expected: "
1583                 + hex(expect)+" =========> got: " + hex(result));
     // Upper bound used to scale pseudo-random values into datachar indexes
     // (mirrors the C RAND_MAX of the original port).
1588 private final int RAND_MAX = 0x7fff;
     // Tests Normalizer.quickCheck(..., FCD, ...): fixed arrays that must be
     // YES (FAST, TRUE) or NO (FALSE), a small table of strings with known
     // results, then 50 random 19-char strings whose FCD answer is derived
     // by comparing an incremental NFD against a one-shot NFD.
1590 public void TestCheckFCD()
         // Control characters: trivially FCD.
1592     char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
1593                    0x0008, 0x0009, 0x000A};
         // Combining marks ordered so that FCD must fail.
1595     char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301,
1596                     0x02B9, 0x0314, 0x0315, 0x0316};
         // Assorted letters/marks that satisfy FCD.
1598     char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7,
1599                    0x0050, 0x0730, 0x09EE, 0x1E10};
1601     char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0},
1602                         {0x0061, 0x030A, 0x00E2, 0x0323, 0},
1603                         {0x0061, 0x0323, 0x00E2, 0x0323, 0},
1604                         {0x0061, 0x0323, 0x1E05, 0x0302, 0}
         // Expected quickCheck(FCD) answer for each datastr row.
1606     Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES};
         // Pool of code points from which random test strings are drawn.
1608     char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
1610         0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
1612         0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306,
1613         0x0307, 0x0308, 0x0309, 0x030a,
1614         0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326,
1615         0x0327, 0x0328, 0x0329, 0x032a,
1616         0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06,
1617         0x1e07, 0x1e08, 0x1e09, 0x1e0a
1622     if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES)
1623         errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n");
1624     if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO)
1625         errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n");
1626     if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES)
1627         errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n");
1632     Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0);
1633     if (result[count] != fcdresult) {
1634         errln("Normalizer.quickCheck(FCD) failed: Data set "+ count
1635             + " expected value "+ result[count]);
1640     /* random checks of long strings */
1641     //srand((unsigned)time( NULL ));
1642     Random rand = createRandom(); // use test framework's random
1644     for (count = 0; count < 50; count ++)
1647         Normalizer.QuickCheckResult testresult = Normalizer.YES;
1648         char[] data= new char[20];
1649         char[] norm= new char[100];
1650         char[] nfd = new char[100];
1653         while (size != 19) {
             // Map [0, RAND_MAX) onto an index in [0, 50) of datachar.
1654             data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX];
1655             logln("0x"+data[size]);
             // Incrementally NFD-normalize one char at a time into norm[].
1656             normStart += Normalizer.normalize(data,size,size+1,
1663         nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0);
1664         //    nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL,
1665         //                       nfd, 100, &status);
         // FCD holds iff per-char NFD equals one-shot NFD.
1666         if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) {
1667             testresult = Normalizer.NO;
1669         if (testresult == Normalizer.YES) {
1670             logln("result Normalizer.YES\n");
1673             logln("result Normalizer.NO\n");
1676         if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) {
1677             errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) );
1683 // reference implementation of Normalizer::compare
     // Computes the comparison by explicit normalization: optionally
     // case-fold both inputs (after NFD), then NFD-decompose and compare
     // either by code point order or by String.compareTo, per 'options'.
     // The high bits of 'options' carry normalization options
     // (COMPARE_NORM_OPTIONS_SHIFT).
1684 private int ref_norm_compare(String s1, String s2, int options) {
1685     String t1, t2,r1,r2;
1687     int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
1689     if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) {
1690         // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1691         r1 = Normalizer.decompose(s1,false,normOptions);
1692         r2 = Normalizer.decompose(s2,false,normOptions);
1693         r1 = UCharacter.foldCase(r1,options);
1694         r2 = UCharacter.foldCase(r2,options);
         // NOTE(review): the else-branch assigning r1/r2 = s1/s2 is on
         // elided lines (1695-1699) — confirm against the full source.
1700     t1 = Normalizer.decompose(r1, false, normOptions);
1701     t2 = Normalizer.decompose(r2, false, normOptions);
1703     if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
1704         UTF16.StringComparator comp
1705             = new UTF16.StringComparator(true, false,
1706                 UTF16.StringComparator.FOLD_CASE_DEFAULT);
1707         return comp.compare(t1,t2);
1709         return t1.compareTo(t2);
1714 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
     // If both inputs already pass the FCD quick check (under the same
     // normalization options), adds INPUT_IS_FCD so Normalizer.compare()
     // can take its fast path; then delegates to Normalizer.compare().
1715 private int norm_compare(String s1, String s2, int options) {
1716     int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
1718     if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) &&
1719         Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) {
1720         options|=Normalizer.INPUT_IS_FCD;
1723     return Normalizer.compare(s1, s2, options);
1726 // reference implementation of UnicodeString::caseCompare
     // Case-folds both strings (honoring FOLD_CASE_EXCLUDE_SPECIAL_I) and
     // compares the folded results either by code point order or by
     // String.compareTo, per 'options'.
     // NOTE(review): the initial assignment of t1/t2 from s1/s2 is on
     // elided lines (1728-1732).
1727 private int ref_case_compare(String s1, String s2, int options) {
1733     t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
1734     t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
1736     if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
1737         UTF16.StringComparator comp
1738             = new UTF16.StringComparator(true, false,
1739                 UTF16.StringComparator.FOLD_CASE_DEFAULT);
1740         return comp.compare(t1,t2);
1742         return t1.compareTo(t2);
1747 // reduce an integer to -1/0/1
     // (value>>31)|1 is -1 for negatives and +1 for positives; the zero
     // case is handled on an elided line (original 1749-1751).
1748 private static int sign(int value) {
1752     return (value>>31)|1;
     // Renders the sign of 'value' as a short string for log messages
     // (branch bodies are on elided lines in this excerpt).
1755 private static String signString(int value) {
1758     } else if(value==0) {
1760     } else /* value>0 */ {
1764 // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
1765 // by comparing it with its semantic equivalent
1766 // since we trust the pieces, this is sufficient
1768 // test each string with itself and each other
1769 // each time with all options
     // Test corpus for TestCompare()/TestCompareDebug(); entries contain
     // "\\Uxxxxxxxx" escape text and are resolved with Utility.unescape()
     // before use. Groups: NormalizationTest.txt cases, case-folding cases,
     // code-point-order cases, long FCD-stress strings, and strings whose
     // result differs depending on fold-vs-decompose order.
1770 private String strings[]=new String[]{
1771     // some cases from NormalizationTest.txt
1773     "D\u031B\u0307\u0323",
1774     "\u1E0C\u031B\u0307",
1775     "D\u031B\u0323\u0307",
1776     "d\u031B\u0323\u0307",
1783     // Angstrom sign = A ring
1791     "a\u059A\u0316\u302A\u032Fb",
1792     "a\u302A\u0316\u032F\u059Ab",
1793     "a\u302A\u0316\u032F\u059Ab",
1794     "A\u059A\u0316\u302A\u032Fb",
1796     // from ICU case folding tests
1798     "A\u00df\u00b5\ufb03\\U0001040c\u0131",
1799     "ass\u03bcffi\\U00010434i",
1800     "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff",
1801     "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff",
1802     "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff",
1803     "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd",
1805     //     U+d800 U+10001 see implementation comment in unorm_cmpEquivFold
1806     // vs. U+10000   at bottom - code point order
1808     "\ud800\ud800\udc01",
1811     // other code point order tests from ustrtest.cpp
1814     "\u20ac\ud800\udc00",
1819     "\uff61\ud800\udc02",
1823     // long strings, see cnormtst.c/TestNormCoverage()
1824     // equivalent if case-insensitive
1826     "\uAD8B\uAD8B\uAD8B\uAD8B"+
1827     "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1828     "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1829     "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1830     "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1831     "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1832     "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1833     "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1834     "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
1835     "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1836     "\uAD8B\uAD8B\uAD8B\uAD8B"+
1837     "d\u031B\u0307\u0323",
1839     "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
1840     "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1841     "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1842     "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1843     "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1844     "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1845     "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1846     "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1847     "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
1848     "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1849     "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
1850     "\u1E0C\u031B\u0307",
1852     // some strings that may make a difference whether the compare function
1853     // case-folds or decomposes first
1855     "\u0360\u0345\u0334",
1856     "\u0360\u03b9\u0334",
1858     "\u0360\u1f80\u0334",
1859     "\u0360\u03b1\u0313\u03b9\u0334",
1861     "\u0360\u1ffc\u0334",
1862     "\u0360\u03c9\u03b9\u0334",
1864     "a\u0360\u0345\u0360\u0345b",
1865     "a\u0345\u0360\u0345\u0360b",
1867     // interesting cases for canonical caseless match with turkic i handling
1872     // strings with post-Unicode 3.2 normalization or normalization corrections
1874     "\u00e4\u193b\\U0002f868",
1875     "\u0061\u193b\u0308\u36fc",
1880 // all combinations of options
1881 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
     // Small (options, name) pair used to label each option combination in
     // the comparison tests; field declarations are on elided lines.
1885 public Temp(int opt,String str){
1891 // set UNORM_UNICODE_3_2 in one additional combination
     // Every compare-option combination exercised by TestCompare()/
     // TestCompareDebug(); the last entry routes UNICODE_3_2 through the
     // normalization-options bits via COMPARE_NORM_OPTIONS_SHIFT.
1893 private Temp[] opt = new Temp[]{
1894     new Temp(0,"default"),
1895     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ),
1896     new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ),
1897     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ),
1898     new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"),
1899     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"),
1900     new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2")
// Debug variant of TestCompare(): compares Normalizer.compare() and
// UTF16.StringComparator against the reference implementations over the
// strings[] corpus for every option combination in opt[], then spot-checks
// a specific case-insensitive pair.
// NOTE(review): the i/j/k loop headers are on elided lines in this
// excerpt.
1904 public void TestCompareDebug(){
1906     String[] s = new String[100]; // at least as many items as in strings[] !
1909     int i, j, k, count=strings.length;
1910     int result, refResult;
1912     // create the UnicodeStrings
1913     for(i=0; i<count; ++i) {
1914         s[i]=Utility.unescape(strings[i]);
1916     UTF16.StringComparator comp = new UTF16.StringComparator(true, false,
1917         UTF16.StringComparator.FOLD_CASE_DEFAULT);
1918     // test them each with each other
1923     // test Normalizer::compare
         // Only the SIGN of the result must agree with the reference.
1924     result=norm_compare(s[i], s[j], opt[k].options);
1925     refResult=ref_norm_compare(s[i], s[j], opt[k].options);
1926     if(sign(result)!=sign(refResult)) {
1927         errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
1930     // test UnicodeString::caseCompare - same internal implementation function
1931     if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
1932         // result=s[i]. (s[j], opt[k].options);
1933         if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
1935             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
1938             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
1941         result=comp.compare(s[i],s[j]);
1942         refResult=ref_case_compare(s[i], s[j], opt[k].options);
1943         if(sign(result)!=sign(refResult)) {
1944             errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
     // U+00DC-ter-y vs U+00FC-ter-y: differ case-sensitively, equal when
     // ignoring case.
1947     String value1 = "\u00dater\u00fd";
1948     String value2 = "\u00fater\u00fd";
1949     if(Normalizer.compare(value1,value2,0)!=0){
1950         if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){
// Full comparison test: (1) every pair from strings[] under every option
// combination, checking Normalizer.compare() and UTF16.StringComparator
// against the reference implementations (sign-only agreement);
// (2) every precomposed character containing i/I variants vs. its NFD,
// for Turkic-i correctness; (3) API checks on Normalizer2/
// FilteredNormalizer2 getDecomposition/getRawDecomposition/composePair.
1956 public void TestCompare() {
1958     String[] s = new String[100]; // at least as many items as in strings[] !
1960     int i, j, k, count=strings.length;
1961     int result, refResult;
1963     // create the UnicodeStrings
1964     for(i=0; i<count; ++i) {
1965         s[i]=Utility.unescape(strings[i]);
1967     UTF16.StringComparator comp = new UTF16.StringComparator();
1968     // test them each with each other
1969     for(i=0; i<count; ++i) {
1970         for(j=i; j<count; ++j) {
1971             for(k=0; k<opt.length; ++k) {
1972                 // test Normalizer::compare
                     // Only the SIGN must agree with the reference.
1973                 result=norm_compare(s[i], s[j], opt[k].options);
1974                 refResult=ref_norm_compare(s[i], s[j], opt[k].options);
1975                 if(sign(result)!=sign(refResult)) {
1976                     errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
1979                 // test UnicodeString::caseCompare - same internal implementation function
1980                 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
1981                     // result=s[i]. (s[j], opt[k].options);
1982                     if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
1984                         comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
1987                         comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
1990                     comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
1991                     // result=comp.caseCompare(s[i],s[j], opt[k].options);
1992                     result=comp.compare(s[i],s[j]);
1993                     refResult=ref_case_compare(s[i], s[j], opt[k].options);
1994                     if(sign(result)!=sign(refResult)) {
1995                         errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
2002     // test cases with i and I to make sure Turkic works
2003     char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
2004     UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();
2005     Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;
2006     nfcImpl.ensureCanonIterData();
2010     // collect all sets into one for contiguous output
2011     for(i=0; i<iI.length; ++i) {
         // getCanonStartSet() fills iSet with characters whose canonical
         // decomposition starts with iI[i].
2012         if(nfcImpl.getCanonStartSet(iI[i], iSet)) {
2017     // test all of these precomposed characters
2018     Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance();
2019     UnicodeSetIterator it = new UnicodeSetIterator(set);
2021     while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) {
2022         s1 = UTF16.valueOf(c);
2023         s2 = nfcNorm2.getDecomposition(c);
2024         for(k=0; k<opt.length; ++k) {
2025             // test Normalizer::compare
2027             result= norm_compare(s1, s2, opt[k].options);
2028             refResult=ref_norm_compare(s1, s2, opt[k].options);
2029             if(sign(result)!=sign(refResult)) {
2030                 errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")"
2031                     + signString(result)+" should be "+signString(refResult));
2034             // test UnicodeString::caseCompare - same internal implementation function
2035             if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) {
2036                 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
2038                     comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
2041                     comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
2044                 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
2046                 result=comp.compare(s1,s2);
2047                 refResult=ref_case_compare(s1, s2, opt[k].options);
2048                 if(sign(result)!=sign(refResult)) {
2049                     errln("UTF16.compare(U+"+hex(c)+" with its NFD, "
2050                         +opt[k].name+")"+signString(result) +" should be "+signString(refResult));
2056     // test getDecomposition() for some characters that do not decompose
2057     if( nfcNorm2.getDecomposition(0x20)!=null ||
2058         nfcNorm2.getDecomposition(0x4e00)!=null ||
2059         nfcNorm2.getDecomposition(0x20002)!=null
2061         errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions");
2064     // test getRawDecomposition() for some characters that do not decompose
2065     if( nfcNorm2.getRawDecomposition(0x20)!=null ||
2066         nfcNorm2.getRawDecomposition(0x4e00)!=null ||
2067         nfcNorm2.getRawDecomposition(0x20002)!=null
2069         errln("getRawDecomposition() returns TRUE for characters which do not have decompositions");
2072     // test composePair() for some pairs of characters that do not compose
2073     if( nfcNorm2.composePair(0x20, 0x301)>=0 ||
2074         nfcNorm2.composePair(0x61, 0x305)>=0 ||
2075         nfcNorm2.composePair(0x1100, 0x1160)>=0 ||
2076         nfcNorm2.composePair(0xac00, 0x11a7)>=0
2078         errln("NFC.composePair() incorrectly composes some pairs of characters");
2081     // test FilteredNormalizer2.getDecomposition()
         // Filter excludes Latin-1 supplement: U+00E4 must NOT decompose,
         // U+0100 (outside the filter) must.
2082     UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]");
2083     FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
2084     if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) {
2085         errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed");
2088     // test FilteredNormalizer2.getRawDecomposition()
2089     if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) {
2090         errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
2093     // test FilteredNormalizer2::composePair()
2094     if( 0x100!=fn2.composePair(0x41, 0x304) ||
2095         fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
2097         errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
2101 // verify that case-folding does not un-FCD strings
     // Walks every code point, skipping unassigned/Hangul/Han ranges, and
     // counts characters whose case folding could break FCD: folding not
     // itself FCD, leading/trailing combining classes changed to nonzero
     // values, or an NFD character whose folding is no longer NFD.
     // Returns the exception count (used by TestFindFoldFCDExceptions()).
     // NOTE(review): several statements (Han-block skips, count updates,
     // the isNFD assignment) are on elided lines in this excerpt.
2102 int countFoldFCDExceptions(int foldingOptions) {
2106     int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC;
2107     Normalizer.QuickCheckResult qcResult;
2112     logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions));
2115     for(c=0; c<=0x10ffff; ++c) {
2116         category=UCharacter.getType(c);
2117         if(category==UCharacterCategory.UNASSIGNED) {
2118             continue; // skip unassigned code points
         // Jump past the Hangul syllable block (no case folding there).
2121         c=0xd7a3; // skip Hangul - no case folding there
2124         // skip Han blocks - no case folding there either
2138         s= UTF16.valueOf(c);
2140         // get leading and trailing cc for c
2141         d= Normalizer.decompose(s,false);
2143         cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));
2144         trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
2146         // get leading and trailing cc for the case-folding of c
         // NOTE(review): the folded result is not captured here — the
         // assignment target is presumably on an elided line; as written
         // this line's return value is discarded. Confirm against the full
         // source.
2147         UCharacter.foldCase(s,(foldingOptions==0));
2148         d = Normalizer.decompose(s, false);
2149         foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));
2150         foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
2152         qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);
2156         // - character maps to empty string: adjacent characters may then need reordering
2157         // - folding has different leading/trailing cc's, and they don't become just 0
2158         // - folding itself is not FCD
2159         if( qcResult!=Normalizer.YES ||
2161             (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)
2164             errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
2165             //errln("  cc %02x trailCC %02x  foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x  quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult);
2170         // if a code point is in NFD but its case folding is not, then
2171         // unorm_compare will also fail
2172         if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {
2174             errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
2178     logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" );
// Driver test: runs countFoldFCDExceptions() with the default folding options and
// again with FOLD_CASE_EXCLUDE_SPECIAL_I, and reports an error if any code point's
// case folding can un-FCD a string under every folding option. unorm_compare()'s
// implementation relies on this property (see comment below).
2182 public void TestFindFoldFCDExceptions() {
2185 count=countFoldFCDExceptions(0);
2186 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I);
2189 //* If case-folding un-FCDs any strings, then unorm_compare() must be
2191 //* It currently assumes that one can check for FCD then case-fold
2192 //* and then still have FCD strings for raw decomposition without reordering.
2194 errln("error: There are "+count+" code points for which case-folding"+
2195 " may un-FCD a string for all folding options.\n See comment"+
2196 " in BasicNormalizerTest::FindFoldFCDExceptions()!");
// Verifies canonical reordering of Tibetan combining vowel signs during NFD:
// U+0F73/U+0F75 decompose (U+0F73 -> 0F71 0F72, U+0F75 -> 0F71 0F74) and the
// resulting marks must be sorted by canonical combining class.
2200 public void TestCombiningMarks(){
2201 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";
2202 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
// decompose(src, false) = NFD (compat=false).
2203 String result = Normalizer.decompose(src,false);
2204 if(!expected.equals(result)){
2205 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result));
2210 * Re-enable this test when UTC fixes UAX 21
2211 public void TestUAX21Failure(){
2212 final String[][] cases = new String[][]{
2213 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"},
2214 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"},
2215 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
2216 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
2217 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"},
2218 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"},
2220 for(int i = 0; i< cases.length; i++){
2221 String s1 =cases[0][0];
2222 String s2 = cases[0][1];
2223 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare
2225 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){
2226 errln("Normalizer.compare() failed for s1: "
2227 + Utility.hex(s1) +" s2: " + Utility.hex(s2));
// Tests Normalizer.getFC_NFKC_Closure() against a table of known closure strings:
// characters whose NFKC+casefold closure is non-trivial (e.g. U+037A GREEK
// YPOGEGRAMMENI -> " \u03B9") plus characters with an empty closure (U+00C4,
// U+00E4, U+0061). Also exercises the (int, char[]) overload's error handling.
2232 public void TestFCNFKCClosure() {
// Local pair of (code point, expected closure string); field declarations and
// constructor body are elided from this view of the file.
2233 final class TestStruct{
2236 TestStruct(int cp, String src){
2242 TestStruct[] tests= new TestStruct[]{
2243 new TestStruct( 0x00C4, "" ),
2244 new TestStruct( 0x00E4, "" ),
2245 new TestStruct( 0x037A, "\u0020\u03B9" ),
2246 new TestStruct( 0x03D2, "\u03C5" ),
2247 new TestStruct( 0x20A8, "\u0072\u0073" ) ,
2248 new TestStruct( 0x210B, "\u0068" ),
2249 new TestStruct( 0x210C, "\u0068" ),
2250 new TestStruct( 0x2121, "\u0074\u0065\u006C" ),
2251 new TestStruct( 0x2122, "\u0074\u006D" ),
2252 new TestStruct( 0x2128, "\u007A" ),
2253 new TestStruct( 0x1D5DB,"\u0068" ),
2254 new TestStruct( 0x1D5ED,"\u007A" ),
2255 new TestStruct( 0x0061, "" )
2259 for(int i = 0; i < tests.length; ++ i) {
2260 String result=Normalizer.getFC_NFKC_Closure(tests[i].c);
2261 if(!result.equals(new String(tests[i].s))) {
2262 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong");
2266 /* error handling */
// Passing a null destination buffer; only the required length is returned.
2268 int length=Normalizer.getFC_NFKC_Closure(0x5c, null);
2270 errln("getFC_NFKC_Closure did not perform error handling correctly");
// Regression test for ICU4J bug J2324: compose() used to throw
// IndexOutOfBoundsException for certain inputs ending in U+309A
// (COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK). Sweeps the
// CJK Symbols/Hiragana range U+3000..U+30FF as lead characters.
2273 public void TestBugJ2324(){
2274 /* String[] input = new String[]{
2283 String troublesome = "\u309A";
2284 for(int i=0x3000; i<0x3100;i++){
2285 String input = ((char)i)+troublesome;
// Only the absence of an exception matters; the composed result is discarded.
2287 /* String result =*/ Normalizer.compose(input,false);
2288 }catch(IndexOutOfBoundsException e){
2289 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString());
// Indices into the per-normalization-form UnicodeSet arrays used by
// initSkippables()/TestSkippable(): NFD, NFC, NFKD, NFKC, FCD, and a sentinel.
2295 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5;
// Computes the expected "skippable" (normalization-inert) sets for NFD/NFC/NFKD/NFKC
// from Unicode properties at runtime, instead of relying on pregenerated patterns:
// start from QC=Yes & ccc=0 (minus Hangul LV for the composing forms), then remove
// from the NFC/NFKC sets every character that changes when any back-combining
// (NFC_QC=Maybe) character is appended.
// @param skipSets array indexed by D/C/KD/KC; patterns are applied in place
// @return the same array, mutated
2297 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) {
2298 skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false);
2299 skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
2300 skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false);
2301 skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
2303 // Remove from the NFC and NFKC sets all those characters that change
2304 // when a back-combining character is added.
2305 // First, get all of the back-combining characters and their combining classes.
2306 UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]");
2307 int numCombineBack=combineBack.size();
// Flat array of (code point, ccc) pairs: even index = char, odd index = its ccc.
2308 int[] combineBackCharsAndCc=new int[numCombineBack*2];
2309 UnicodeSetIterator iter=new UnicodeSetIterator(combineBack);
2310 for(int i=0; i<numCombineBack; ++i) {
2312 int c=iter.codepoint;
2313 combineBackCharsAndCc[2*i]=c;
2314 combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c);
2317 // We need not look at control codes, Han characters nor Hangul LVT syllables because they
2318 // do not combine forward. LV syllables are already removed.
2319 UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]");
2320 UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting);
2321 // System.out.format("unsure.size()=%d\n", unsure.size());
2323 // For each character about which we are unsure, see if it changes when we add
2324 // one of the back-combining characters.
2325 Normalizer2 norm2=Normalizer2.getNFCInstance();
2326 StringBuilder s=new StringBuilder();
2328 while(iter.next()) {
2329 int c=iter.codepoint;
// delete(0, MAX_INT) clears the builder; then seed it with the candidate char.
2330 s.delete(0, 0x7fffffff).appendCodePoint(c);
2331 int cLength=s.length();
2332 int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
2333 for(int i=0; i<numCombineBack; ++i) {
2334 // If c's decomposition ends with a character with non-zero combining class, then
2335 // c can only change if it combines with a character with a non-zero combining class.
2336 int cc2=combineBackCharsAndCc[2*i+1];
2337 if(tccc==0 || cc2!=0) {
2338 int c2=combineBackCharsAndCc[2*i];
2339 s.appendCodePoint(c2);
2340 if(!norm2.isNormalized(s)) {
2341 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2);
2342 skipSets[C].remove(c);
2343 skipSets[KC].remove(c);
// Trim back to just the candidate char before trying the next back-combiner.
2346 s.delete(cLength, 0x7fffffff);
// Cross-checks the runtime NF*_Inert property sets against the independently
// computed expected sets from initSkippables(). On mismatch, logs the skip set,
// both set differences, and the intersection to aid debugging.
2353 public void TestSkippable() {
2354 UnicodeSet[] skipSets = new UnicodeSet[] {
2355 new UnicodeSet(), //NFD
2356 new UnicodeSet(), //NFC
2357 new UnicodeSet(), //NFKD
2358 new UnicodeSet() //NFKC
2360 UnicodeSet[] expectSets = new UnicodeSet[] {
2366 StringBuilder s, pattern;
2368 // build NF*Skippable sets from runtime data
2369 skipSets[D].applyPattern("[:NFD_Inert:]");
2370 skipSets[C].applyPattern("[:NFC_Inert:]");
2371 skipSets[KD].applyPattern("[:NFKD_Inert:]");
2372 skipSets[KC].applyPattern("[:NFKC_Inert:]");
2374 expectSets = initSkippables(expectSets);
// Sanity check: U+0350 has a non-zero combining class and must not be NFD-inert.
2375 if(expectSets[D].contains(0x0350)){
2376 errln("expectSets[D] contains 0x0350");
2378 for(int i=0; i<expectSets.length; ++i) {
2379 if(!skipSets[i].equals(expectSets[i])) {
2380 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n");
2381 // Note: This used to depend on hardcoded UnicodeSet patterns generated by
2382 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
2383 // running com.ibm.text.UCD.Main with the option NFSkippable.
2384 // Since ICU 4.6/Unicode 6, we are generating the
2385 // expectSets ourselves in initSkippables().
2387 s=new StringBuilder();
2389 s.append("\n\nskip= ");
2390 s.append(skipSets[i].toPattern(true));
2393 s.append("skip-expect=");
// Characters the runtime claims are inert but the expected set does not.
2394 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));
2397 pattern.delete(0,pattern.length());
2398 s.append("\n\nexpect-skip=");
// Characters the expected set contains but the runtime set is missing.
2399 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));
2403 pattern.delete(0,pattern.length());
2404 s.append("\n\nintersection(expect,skip)=");
2405 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);
2406 pattern = new StringBuilder(intersection.toPattern(true));
2410 errln(s.toString());
// Regression test for ICU4J bug J2068: the Normalizer constructor must clone the
// supplied UCharacterIterator, so advancing the Normalizer never moves the
// caller's iterator (their current() values must differ after construction).
2415 public void TestBugJ2068(){
2416 String sample = "The quick brown fox jumped over the lazy dog";
2417 UCharacterIterator text = UCharacterIterator.getInstance(sample);
2418 Normalizer norm = new Normalizer(text,Normalizer.NFC,0);
2420 if(text.current() == norm.current()){
2421 errln("Normalizer is not cloning the UCharacterIterator");
// Checks that no surrogate code point (U+D800..U+DFFF) reports a non-zero
// canonical combining class.
// NOTE(review): the loop bound is i<0x10FFFF, which skips U+10FFFF itself —
// likely harmless here (U+10FFFF is not a surrogate) but worth confirming
// whether <= was intended.
2424 public void TestGetCombiningClass(){
2425 for(int i=0;i<0x10FFFF;i++){
2426 int cc = UCharacter.getCombiningClass(i);
2427 if(0xD800<= i && i<=0xDFFF && cc >0 ){
// NOTE(review): this second call recomputes the same value already held in cc;
// it appears redundant (possibly a leftover debugging aid).
2428 cc = UCharacter.getCombiningClass(i);
2429 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8));
// Exercises USerializedSet: deserializes a set from its char[] form (the array
// contents are elided in this view), collects its ranges into a UnicodeSet, and
// verifies USerializedSet.contains() agrees for the characters in those ranges.
2434 public void TestSerializedSet(){
2435 USerializedSet sset=new USerializedSet();
2436 UnicodeSet set = new UnicodeSet();
2439 char[] serialized = {
2445 sset.getSet(serialized, 0);
2447 // collect all sets into one for contiguous output
// getRange() writes [start, end] into this two-element scratch array.
2448 int[] startEnd = new int[2];
2449 int count=sset.countRanges();
2450 for(int j=0; j<count; ++j) {
2451 sset.getRange(j, startEnd);
2452 set.add(startEnd[0], startEnd[1]);
2455 // test all of these characters
2456 UnicodeSetIterator it = new UnicodeSetIterator(set);
2457 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
2459 end=it.codepointEnd;
2461 if(!sset.contains(start)){
2462 errln("USerializedSet.contains failed for "+Utility.hex(start,8));
// Verifies that the char[]-based Normalizer.decompose() overload returns the same
// result length regardless of the destination start offset (0 vs. 10) when the
// buffer is large enough.
2469 public void TestReturnFailure(){
2470 char[] term = {'r','\u00e9','s','u','m','\u00e9' };
// Extra headroom: 10 leading slots for the offset test plus 2 spare chars.
2471 char[] decomposed_term = new char[10 + term.length + 2];
2472 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0);
2473 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0);
2475 errln("Normalizer decompose did not return correct length");
// Simple value holder for one TestComposition() case: a normalization mode,
// option bits, an input string, and the expected normalized output.
// (Some field declarations/assignments are elided from this view of the file.)
2479 private final static class TestCompositionCase {
2480 public Normalizer.Mode mode;
2482 public String input, expect;
2483 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) {
2485 this.options=options;
2491 public void TestComposition() {
2492 final TestCompositionCase cases[]=new TestCompositionCase[]{
2494 * special cases for UAX #15 bug
2495 * see Unicode Corrigendum #5: Normalization Idempotency
2496 * at http://unicode.org/versions/corrigendum5.html
2497 * (was Public Review Issue #29)
2499 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"),
2500 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),
2501 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"),
2502 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"),
2504 /* TODO: add test cases for UNORM_FCC here (j2151) */
2510 for(i=0; i<cases.length; ++i) {
2511 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options);
2512 if(!output.equals(cases[i].expect)) {
2513 errln("unexpected result for case "+i);
// Tests Normalizer2.getDecomposition() on the FCC (contiguous-composition)
// instance: null for a character with no decomposition, the canonical
// decomposition for a-umlaut, and the full jamo expansion for a Hangul syllable.
2518 public void TestGetDecomposition() {
2519 Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS);
2520 String decomp=n2.getDecomposition(0x20);
2521 assertEquals("fcc.getDecomposition(space) failed", null, decomp);
2522 decomp=n2.getDecomposition(0xe4);
2523 assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp);
2524 decomp=n2.getDecomposition(0xac01);
2525 assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp);
// Tests Normalizer2.getRawDecomposition() on the NFKC instance: raw decompositions
// are single-level Decomposition_Mapping values, without recursion — so C-cedilla-
// acute yields "\u00C7\u0301" (not fully decomposed) and a Hangul LVT syllable
// yields LV syllable + T.
2528 public void TestGetRawDecomposition() {
2529 Normalizer2 n2=Normalizer2.getNFKCInstance();
2531 * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values,
2532 * without recursive decomposition.
2535 String decomp=n2.getRawDecomposition(0x20);
2536 assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp);
2537 decomp=n2.getRawDecomposition(0xe4);
2538 assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp);
2539 /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */
2540 decomp=n2.getRawDecomposition(0x1e08);
2541 assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp);
2542 /* U+212B ANGSTROM SIGN */
2543 decomp=n2.getRawDecomposition(0x212b);
2544 assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp);
2545 decomp=n2.getRawDecomposition(0xac00);
2546 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp);
2547 /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */
2548 decomp=n2.getRawDecomposition(0xac01);
2549 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp);
// Loads a custom normalization data file (testnorm.nrm) from test resources and
// checks COMPOSE-mode normalization of escaped input/expected string pairs,
// including unpaired-surrogate and supplementary-plane edge cases.
2552 public void TestCustomComp() {
2554 { "\\uD801\\uE000\\uDFFE", "" },
2555 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
2556 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
2557 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },
2558 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
2559 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
2560 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
2561 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
2563 Normalizer2 customNorm2;
2565 Normalizer2.getInstance(
2566 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),
2568 Normalizer2.Mode.COMPOSE);
2569 for(int i=0; i<pairs.length; ++i) {
2570 String[] pair=pairs[i];
// The pair strings are stored escaped; unescape() turns \\uXXXX / \\UXXXXXXXX
// sequences into actual (possibly ill-formed UTF-16) char sequences.
2571 String input=Utility.unescape(pair[0]);
2572 String expected=Utility.unescape(pair[1]);
2573 String result=customNorm2.normalize(input);
2574 if(!result.equals(expected)) {
2575 errln("custom compose Normalizer2 did not normalize input "+i+" as expected");
// Same data file and pairs as TestCustomComp, but using COMPOSE_CONTIGUOUS (FCC)
// mode; one expected result differs because FCC only composes with contiguous
// characters (see the comment on the differing pair below).
2580 public void TestCustomFCC() {
2582 { "\\uD801\\uE000\\uDFFE", "" },
2583 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
2584 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
2585 // The following expected result is different from CustomComp
2586 // because of only-contiguous composition.
2587 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },
2588 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
2589 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
2590 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
2591 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
2593 Normalizer2 customNorm2;
2595 Normalizer2.getInstance(
2596 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),
2598 Normalizer2.Mode.COMPOSE_CONTIGUOUS);
2599 for(int i=0; i<pairs.length; ++i) {
2600 String[] pair=pairs[i];
2601 String input=Utility.unescape(pair[0]);
2602 String expected=Utility.unescape(pair[1]);
2603 String result=customNorm2.normalize(input);
2604 if(!result.equals(expected)) {
2605 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected");
// Regression test for the canonical-iterator data: U+0FB5 is normalization-inert
// but must NOT be a canonical segment starter (it occurs as a trailing character
// in decomposition mappings). Also cross-checks isCanonSegmentStarter() against
// the [:Segment_Starter:] UnicodeSet for U+0000..U+33FF.
2610 public void TestCanonIterData() {
2611 // For now, just a regression test.
// Uses internal impl classes (Norm2AllModes/Normalizer2Impl) deliberately —
// the canon-iter data has no public accessor for this check.
2612 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData();
2613 // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character
2614 // in some decomposition mappings where there is a composition exclusion.
2615 // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0)
2616 // but it is not a segment starter because it occurs in a decomposition mapping.
2617 if(impl.isCanonSegmentStarter(0xfb5)) {
2618 errln("isCanonSegmentStarter(U+0fb5)=true is wrong");
2620 // For [:Segment_Starter:] to work right, not just the property function has to work right,
2621 // UnicodeSet also needs a correct range starts set.
2622 UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze();
2623 if(segStarters.contains(0xfb5)) {
2624 errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong");
2626 // Try characters up to Kana and miscellaneous CJK but below Han (for expediency).
2627 for(int c=0; c<=0x33ff; ++c) {
2628 boolean isStarter=impl.isCanonSegmentStarter(c);
2629 boolean isContained=segStarters.contains(c);
2630 if(isStarter!=isContained) {
2631 errln(String.format(
2632 "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " +
2633 "[:Segment_Starter:].contains(same)",
// Tests FilteredNormalizer2.getCombiningClass(): for characters inside the filter
// set it must match NFC's combining class, and for filtered-out characters
// (U+00A0..U+00FF, U+0310..U+031F) it must report 0.
2639 public void TestFilteredNormalizer2() {
2640 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance();
// Filter excludes Latin-1 supplement letters and a block of combining marks.
2641 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]");
2642 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
2644 for(c=0; c<=0x3ff; ++c) {
2645 int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0;
2646 int cc=fn2.getCombiningClass(c);
2648 "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+
2649 ")==filtered NFC.getCC()",
// Tests FilteredNormalizer2's append paths: characters outside the filter set
// (here U+0313, U+0301 are filtered out of normalization... U+0313 is in the
// excluded 0310-031F block) must pass through unchanged while in-filter text is
// still composed (a + U+0301 -> á). Covers append(), normalizeSecondAndAppend(),
// and normalize(String), which internally uses spanQuickCheckYes() + append.
2654 public void TestFilteredAppend() {
2655 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance();
2656 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]");
2657 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
2659 // Append two strings that each contain a character outside the filter set.
2660 StringBuilder sb = new StringBuilder("a\u0313a");
2661 String second = "\u0301\u0313";
2662 assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString());
2664 // Same, and also normalize the second string.
// replace(0, MAX_INT, ...) resets the builder to the original first string.
2665 sb.replace(0, 0x7fffffff, "a\u0313a");
2667 "normalizeSecondAndAppend()",
2668 "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString());
2670 // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend().
2671 assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313"));
2674 public void TestGetEasyToUseInstance() {
2675 // Test input string:
2676 // U+00A0 -> <noBreak> 0020
2677 // U+00C7 0301 = 1E08 = 0043 0327 0301
2678 String in="\u00A0\u00C7\u0301";
2679 Normalizer2 n2=Normalizer2.getNFCInstance();
2680 String out=n2.normalize(in);
2682 "getNFCInstance() did not return an NFC instance " +
2683 "(normalizes to " + prettify(out) + ')',
2684 "\u00A0\u1E08", out);
2686 n2=Normalizer2.getNFDInstance();
2687 out=n2.normalize(in);
2689 "getNFDInstance() did not return an NFD instance " +
2690 "(normalizes to " + prettify(out) + ')',
2691 "\u00A0C\u0327\u0301", out);
2693 n2=Normalizer2.getNFKCInstance();
2694 out=n2.normalize(in);
2696 "getNFKCInstance() did not return an NFKC instance " +
2697 "(normalizes to " + prettify(out) + ')',
2700 n2=Normalizer2.getNFKDInstance();
2701 out=n2.normalize(in);
2703 "getNFKDInstance() did not return an NFKD instance " +
2704 "(normalizes to " + prettify(out) + ')',
2705 " C\u0327\u0301", out);
2707 n2=Normalizer2.getNFKCCasefoldInstance();
2708 out=n2.normalize(in);
2710 "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " +
2711 "(normalizes to " + prettify(out) + ')',