2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.dev.test.normalizer;
\r
10 import java.text.StringCharacterIterator;
\r
11 import java.util.Random;
\r
13 import com.ibm.icu.dev.test.TestFmwk;
\r
14 import com.ibm.icu.impl.NormalizerImpl;
\r
15 import com.ibm.icu.impl.USerializedSet;
\r
16 import com.ibm.icu.impl.Utility;
\r
17 import com.ibm.icu.lang.*;
\r
18 import com.ibm.icu.lang.UCharacter;
\r
19 import com.ibm.icu.lang.UCharacterCategory;
\r
20 import com.ibm.icu.text.Normalizer;
\r
21 import com.ibm.icu.text.UCharacterIterator;
\r
22 import com.ibm.icu.text.UTF16;
\r
23 import com.ibm.icu.text.UnicodeSet;
\r
24 import com.ibm.icu.text.UnicodeSetIterator;
\r
27 public class BasicTest extends TestFmwk {
\r
28 public static void main(String[] args) throws Exception {
\r
29 new BasicTest().run(args);
\r
32 String[][] canonTests = {
\r
33 // Input Decomposed Composed
\r
34 { "cat", "cat", "cat" },
\r
35 { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", },
\r
37 { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above
\r
38 { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above
\r
40 { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above
\r
41 { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below
\r
42 { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above
\r
44 { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
\r
45 { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
\r
47 { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave
\r
48 { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave
\r
49 { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron
\r
51 { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign
\r
52 { "\u00c5", "A\u030a", "\u00c5" }, // A-ring
\r
54 { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" },
\r
55 { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" },
\r
57 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0
\r
58 { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0
\r
60 { "Henry IV", "Henry IV", "Henry IV" },
\r
61 { "Henry \u2163", "Henry \u2163", "Henry \u2163" },
\r
63 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
\r
64 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
\r
65 { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten
\r
66 { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten
\r
67 { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten
\r
69 { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
\r
70 {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},
\r
73 String[][] compatTests = {
\r
74 // Input Decomposed Composed
\r
75 { "cat", "cat", "cat" },
\r
76 { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. Alef, Lamed
\r
78 { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" },
\r
79 { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i
\r
81 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0
\r
82 { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i
\r
84 { "Henry IV", "Henry IV", "Henry IV" },
\r
85 { "Henry \u2163", "Henry IV", "Henry IV" },
\r
87 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
\r
88 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
\r
90 { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten
\r
92 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
\r
93 { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten
\r
94 { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten
\r
98 // With Canonical decomposition, Hangul syllables should get decomposed
\r
99 // into Jamo, but Jamo characters should not be decomposed into
\r
101 String[][] hangulCanon = {
\r
102 // Input Decomposed Composed
\r
103 { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" },
\r
104 { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" },
\r
107 // With compatibility decomposition turned on,
\r
108 // it should go all the way down to conjoining Jamo characters.
\r
109 // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE
\r
110 String[][] hangulCompat = {
\r
111 // Input Decomposed Composed
\r
112 // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" },
\r
115 public void TestHangulCompose()
\r
117 // Make sure that the static composition methods work
\r
118 logln("Canonical composition...");
\r
119 staticTest(Normalizer.NFC, hangulCanon, 2);
\r
120 logln("Compatibility composition...");
\r
121 staticTest(Normalizer.NFKC, hangulCompat, 2);
\r
122 // Now try iterative composition....
\r
123 logln("Iterative composition...");
\r
124 Normalizer norm = new Normalizer("", Normalizer.NFC,0);
\r
125 iterateTest(norm, hangulCanon, 2);
\r
127 norm.setMode(Normalizer.NFKD);
\r
128 iterateTest(norm, hangulCompat, 2);
\r
130 // And finally, make sure you can do it in reverse too
\r
131 logln("Reverse iteration...");
\r
132 norm.setMode(Normalizer.NFC);
\r
133 backAndForth(norm, hangulCanon);
\r
136 public void TestHangulDecomp() throws Exception{
\r
137 // Make sure that the static decomposition methods work
\r
138 logln("Canonical decomposition...");
\r
139 staticTest(Normalizer.NFD, hangulCanon, 1);
\r
140 logln("Compatibility decomposition...");
\r
141 staticTest(Normalizer.NFKD, hangulCompat, 1);
\r
143 // Now the iterative decomposition methods...
\r
144 logln("Iterative decomposition...");
\r
145 Normalizer norm = new Normalizer("", Normalizer.NFD,0);
\r
146 iterateTest(norm, hangulCanon, 1);
\r
148 norm.setMode(Normalizer.NFKD);
\r
149 iterateTest(norm, hangulCompat, 1);
\r
151 // And finally, make sure you can do it in reverse too
\r
152 logln("Reverse iteration...");
\r
153 norm.setMode(Normalizer.NFD);
\r
154 backAndForth(norm, hangulCanon);
\r
156 public void TestNone() throws Exception{
\r
157 Normalizer norm = new Normalizer("", Normalizer.NONE,0);
\r
158 iterateTest(norm, canonTests, 0);
\r
159 staticTest(Normalizer.NONE, canonTests, 0);
\r
161 public void TestDecomp() throws Exception{
\r
162 Normalizer norm = new Normalizer("", Normalizer.NFD,0);
\r
163 iterateTest(norm, canonTests, 1);
\r
164 staticTest(Normalizer.NFD, canonTests, 1);
\r
165 decomposeTest(Normalizer.NFD, canonTests, 1);
\r
168 public void TestCompatDecomp() throws Exception{
\r
169 Normalizer norm = new Normalizer("", Normalizer.NFKD,0);
\r
170 iterateTest(norm, compatTests, 1);
\r
171 staticTest(Normalizer.NFKD,compatTests, 1);
\r
172 decomposeTest(Normalizer.NFKD,compatTests, 1);
\r
175 public void TestCanonCompose() throws Exception{
\r
176 Normalizer norm = new Normalizer("", Normalizer.NFC,0);
\r
177 iterateTest(norm, canonTests, 2);
\r
178 staticTest(Normalizer.NFC, canonTests, 2);
\r
179 composeTest(Normalizer.NFC, canonTests, 2);
\r
182 public void TestCompatCompose() throws Exception{
\r
183 Normalizer norm = new Normalizer("", Normalizer.NFKC,0);
\r
184 iterateTest(norm, compatTests, 2);
\r
185 staticTest(Normalizer.NFKC,compatTests, 2);
\r
186 composeTest(Normalizer.NFKC,compatTests, 2);
\r
189 public void TestExplodingBase() throws Exception{
\r
190 // \u017f - Latin small letter long s
\r
191 // \u0307 - combining dot above
\r
192 // \u1e61 - Latin small letter s with dot above
\r
193 // \u1e9b - Latin small letter long s with dot above
\r
194 String[][] canon = {
\r
195 // Input Decomposed Composed
\r
196 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" },
\r
197 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" },
\r
199 String[][] compat = {
\r
200 // Input Decomposed Composed
\r
201 { "\u017f", "s", "s" },
\r
202 { "\u1e9b", "s\u0307", "\u1e61" },
\r
205 staticTest(Normalizer.NFD, canon, 1);
\r
206 staticTest(Normalizer.NFC, canon, 2);
\r
208 staticTest(Normalizer.NFKD, compat, 1);
\r
209 staticTest(Normalizer.NFKC, compat, 2);
\r
214 * The Tibetan vowel sign AA, 0f71, was messed up prior to
\r
215 * Unicode version 2.1.9.
\r
216 * Once 2.1.9 or 3.0 is released, uncomment this test.
\r
218 public void TestTibetan() throws Exception{
\r
219 String[][] decomp = {
\r
220 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
\r
222 String[][] compose = {
\r
223 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
\r
226 staticTest(Normalizer.NFD, decomp, 1);
\r
227 staticTest(Normalizer.NFKD,decomp, 2);
\r
228 staticTest(Normalizer.NFC, compose, 1);
\r
229 staticTest(Normalizer.NFKC,compose, 2);
\r
233 * Make sure characters in the CompositionExclusion.txt list do not get
\r
236 public void TestCompositionExclusion()
\r
238 // This list is generated from CompositionExclusion.txt.
\r
239 // Update whenever the normalizer tables are updated. Note
\r
240 // that we test all characters listed, even those that can be
\r
241 // derived from the Unicode DB and are therefore commented
\r
244 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +
\r
245 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +
\r
246 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +
\r
247 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +
\r
248 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +
\r
249 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +
\r
250 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" +
\r
251 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +
\r
252 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +
\r
253 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +
\r
254 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +
\r
255 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +
\r
256 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +
\r
257 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E";
\r
258 for (int i=0; i<EXCLUDED.length(); ++i) {
\r
259 String a = String.valueOf(EXCLUDED.charAt(i));
\r
260 String b = Normalizer.normalize(a, Normalizer.NFKD);
\r
261 String c = Normalizer.normalize(b, Normalizer.NFC);
\r
263 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
\r
264 hex(b) + " x COMPOSE => " +
\r
266 } else if (isVerbose()) {
\r
267 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
\r
268 hex(b) + " x COMPOSE => " +
\r
272 // The following method works too, but it is somewhat
\r
273 // incestuous. It uses UInfo, which is the same database that
\r
274 // NormalizerBuilder uses, so if something is wrong with
\r
275 // UInfo, the following test won't show it. All it will show
\r
276 // is that NormalizerBuilder has been run with whatever the
\r
277 // current UInfo is.
\r
279 // We comment this out in favor of the test above, which
\r
280 // provides independent verification (but also requires
\r
281 // independent updating).
\r
283 // UInfo uinfo = new UInfo();
\r
284 // for (int i=0; i<=0xFFFF; ++i) {
\r
285 // if (!uinfo.isExcludedComposition((char)i) ||
\r
286 // (!uinfo.hasCanonicalDecomposition((char)i) &&
\r
287 // !uinfo.hasCompatibilityDecomposition((char)i))) continue;
\r
288 // String a = String.valueOf((char)i);
\r
289 // String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0);
\r
290 // String c = Normalizer.normalize(b,Normalizer.COMPOSE,0);
\r
291 // if (c.equals(a)) {
\r
292 // errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
\r
293 // hex(b) + " x COMPOSE => " +
\r
295 // } else if (isVerbose()) {
\r
296 // logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
\r
297 // hex(b) + " x COMPOSE => " +
\r
304 * Test for a problem that showed up just before ICU 1.6 release
\r
305 * having to do with combining characters with an index of zero.
\r
306 * Such characters do not participate in any canonical
\r
307 * decompositions. However, having an index of zero means that
\r
308 * they all share one typeMask[] entry, that is, they all have to
\r
309 * map to the same canonical class, which is not the case, in
\r
312 public void TestZeroIndex()
\r
315 // Expect col1 x COMPOSE_COMPAT => col2
\r
316 // Expect col2 x DECOMP => col3
\r
317 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
\r
318 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
\r
319 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
\r
320 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
\r
321 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
\r
324 for (int i=0; i<DATA.length; i+=3) {
\r
325 String a = DATA[i];
\r
326 String b = Normalizer.normalize(a, Normalizer.NFKC);
\r
327 String exp = DATA[i+1];
\r
328 if (b.equals(exp)) {
\r
329 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));
\r
331 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +
\r
332 ", expect " + hex(exp));
\r
334 a = Normalizer.normalize(b, Normalizer.NFD);
\r
336 if (a.equals(exp)) {
\r
337 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a));
\r
339 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) +
\r
340 ", expect " + hex(exp));
\r
346 * Test for a problem found by Verisign. Problem is that
\r
347 * characters at the start of a string are not put in canonical
\r
348 * order correctly by compose() if there is no starter.
\r
350 public void TestVerisign()
\r
352 String[] inputs = {
\r
353 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
\r
354 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
\r
356 String[] outputs = {
\r
357 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
\r
358 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
\r
361 for (int i = 0; i < inputs.length; ++i) {
\r
362 String input = inputs[i];
\r
363 String output = outputs[i];
\r
364 String result = Normalizer.decompose(input, false);
\r
365 if (!result.equals(output)) {
\r
366 errln("FAIL input: " + hex(input));
\r
367 errln(" decompose: " + hex(result));
\r
368 errln(" expected: " + hex(output));
\r
370 result = Normalizer.compose(input, false);
\r
371 if (!result.equals(output)) {
\r
372 errln("FAIL input: " + hex(input));
\r
373 errln(" compose: " + hex(result));
\r
374 errln(" expected: " + hex(output));
\r
379 public void TestQuickCheckResultNO()
\r
381 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C,
\r
382 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E};
\r
383 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB,
\r
384 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E};
\r
385 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE,
\r
386 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
\r
387 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE,
\r
388 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
\r
391 final int SIZE = 10;
\r
394 for (; count < SIZE; count ++)
\r
396 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
\r
397 Normalizer.NFD,0) != Normalizer.NO)
\r
399 errln("ERROR in NFD quick check at U+" +
\r
400 Integer.toHexString(CPNFD[count]));
\r
403 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
\r
404 Normalizer.NFC,0) !=Normalizer.NO)
\r
406 errln("ERROR in NFC quick check at U+"+
\r
407 Integer.toHexString(CPNFC[count]));
\r
410 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
\r
411 Normalizer.NFKD,0) != Normalizer.NO)
\r
413 errln("ERROR in NFKD quick check at U+"+
\r
414 Integer.toHexString(CPNFKD[count]));
\r
417 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
418 Normalizer.NFKC,0) !=Normalizer.NO)
\r
420 errln("ERROR in NFKC quick check at U+"+
\r
421 Integer.toHexString(CPNFKC[count]));
\r
424 // for improving coverage
\r
425 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
426 Normalizer.NFKC) !=Normalizer.NO)
\r
428 errln("ERROR in NFKC quick check at U+"+
\r
429 Integer.toHexString(CPNFKC[count]));
\r
436 public void TestQuickCheckResultYES()
\r
438 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A,
\r
439 0x2261, 0x3075, 0x4000, 0x5000, 0xF000};
\r
440 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500,
\r
441 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000};
\r
442 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB,
\r
443 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27};
\r
444 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000,
\r
445 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E};
\r
447 final int SIZE = 10;
\r
453 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0)
\r
456 errln("ERROR in NFD quick check at U+"+
\r
457 Integer.toHexString(cp));
\r
460 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0)
\r
463 errln("ERROR in NFC quick check at U+"+
\r
464 Integer.toHexString(cp));
\r
467 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0)
\r
470 errln("ERROR in NFKD quick check at U+" +
\r
471 Integer.toHexString(cp));
\r
474 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0)
\r
477 errln("ERROR in NFKC quick check at U+"+
\r
478 Integer.toHexString(cp));
\r
481 // improve the coverage
\r
482 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC)
\r
485 errln("ERROR in NFKC quick check at U+"+
\r
486 Integer.toHexString(cp));
\r
492 for (; count < SIZE; count ++)
\r
494 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
\r
495 Normalizer.NFD,0)!=Normalizer.YES)
\r
497 errln("ERROR in NFD quick check at U+"+
\r
498 Integer.toHexString(CPNFD[count]));
\r
501 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
\r
502 Normalizer.NFC,0)!=Normalizer.YES)
\r
504 errln("ERROR in NFC quick check at U+"+
\r
505 Integer.toHexString(CPNFC[count]));
\r
508 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
\r
509 Normalizer.NFKD,0)!=Normalizer.YES)
\r
511 errln("ERROR in NFKD quick check at U+"+
\r
512 Integer.toHexString(CPNFKD[count]));
\r
515 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
516 Normalizer.NFKC,0)!=Normalizer.YES)
\r
518 errln("ERROR in NFKC quick check at U+"+
\r
519 Integer.toHexString(CPNFKC[count]));
\r
522 // improve the coverage
\r
523 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
524 Normalizer.NFKC)!=Normalizer.YES)
\r
526 errln("ERROR in NFKC quick check at U+"+
\r
527 Integer.toHexString(CPNFKC[count]));
\r
532 public void TestBengali() throws Exception{
\r
533 String input = "\u09bc\u09be\u09cd\u09be";
\r
534 String output=Normalizer.normalize(input,Normalizer.NFC);
\r
535 if(!input.equals(output)){
\r
536 errln("ERROR in NFC of string");
\r
539 public void TestQuickCheckResultMAYBE()
\r
542 final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161,
\r
543 0x116A, 0x1173, 0x1175, 0x3099, 0x309A};
\r
544 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E,
\r
545 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099};
\r
548 final int SIZE = 10;
\r
552 /* NFD and NFKD does not have any MAYBE codepoints */
\r
553 for (; count < SIZE; count ++)
\r
555 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
\r
556 Normalizer.NFC,0)!=Normalizer.MAYBE)
\r
558 errln("ERROR in NFC quick check at U+"+
\r
559 Integer.toHexString(CPNFC[count]));
\r
562 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
563 Normalizer.NFKC,0)!=Normalizer.MAYBE)
\r
565 errln("ERROR in NFKC quick check at U+"+
\r
566 Integer.toHexString(CPNFKC[count]));
\r
569 if (Normalizer.quickCheck(new char[]{CPNFC[count]},
\r
570 Normalizer.NFC,0)!=Normalizer.MAYBE)
\r
572 errln("ERROR in NFC quick check at U+"+
\r
573 Integer.toHexString(CPNFC[count]));
\r
576 if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
\r
577 Normalizer.NFKC,0)!=Normalizer.MAYBE)
\r
579 errln("ERROR in NFKC quick check at U+"+
\r
580 Integer.toHexString(CPNFKC[count]));
\r
583 if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
\r
584 Normalizer.NONE,0)!=Normalizer.MAYBE)
\r
586 errln("ERROR in NFKC quick check at U+"+
\r
587 Integer.toHexString(CPNFKC[count]));
\r
593 public void TestQuickCheckStringResult()
\r
599 for (count = 0; count < canonTests.length; count ++)
\r
601 d = canonTests[count][1];
\r
602 c = canonTests[count][2];
\r
603 if (Normalizer.quickCheck(d,Normalizer.NFD,0)
\r
606 errln("ERROR in NFD quick check for string at count " + count);
\r
610 if (Normalizer.quickCheck(c, Normalizer.NFC,0)
\r
613 errln("ERROR in NFC quick check for string at count " + count);
\r
618 for (count = 0; count < compatTests.length; count ++)
\r
620 d = compatTests[count][1];
\r
621 c = compatTests[count][2];
\r
622 if (Normalizer.quickCheck(d, Normalizer.NFKD,0)
\r
625 errln("ERROR in NFKD quick check for string at count " + count);
\r
629 if (Normalizer.quickCheck(c, Normalizer.NFKC,0)
\r
632 errln("ERROR in NFKC quick check for string at count " + count);
\r
638 static final int qcToInt(Normalizer.QuickCheckResult qc) {
\r
639 if(qc==Normalizer.NO) {
\r
641 } else if(qc==Normalizer.YES) {
\r
643 } else /* Normalizer.MAYBE */ {
\r
648 public void TestQuickCheckPerCP() {
\r
649 int c, lead, trail;
\r
651 int lccc1, lccc2, tccc1, tccc2;
\r
655 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES
\r
656 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 ||
\r
657 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE
\r
658 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 ||
\r
659 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) ||
\r
660 UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)
\r
662 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS");
\r
666 * compare the quick check property values for some code points
\r
667 * to the quick check results for checking same-code point strings
\r
670 while(c<0x110000) {
\r
671 s=UTF16.valueOf(c);
\r
673 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK);
\r
674 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC));
\r
676 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c));
\r
679 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK);
\r
680 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD));
\r
682 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c));
\r
685 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK);
\r
686 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC));
\r
688 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c));
\r
691 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK);
\r
692 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD));
\r
694 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c));
\r
697 nfd=Normalizer.normalize(s, Normalizer.NFD);
\r
698 lead=UTF16.charAt(nfd, 0);
\r
699 trail=UTF16.charAt(nfd, nfd.length()-1);
\r
701 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS);
\r
702 lccc2=UCharacter.getCombiningClass(lead);
\r
703 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
\r
704 tccc2=UCharacter.getCombiningClass(trail);
\r
707 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c));
\r
710 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c));
\r
713 /* skip some code points */
\r
718 //------------------------------------------------------------------------
\r
719 // Internal utilities
\r
721 //------------------------------------------------------------------------
\r
722 // Internal utilities
\r
725 /* private void backAndForth(Normalizer iter, String input)
\r
727 iter.setText(input);
\r
729 // Run through the iterator forwards and stick it into a StringBuffer
\r
730 StringBuffer forward = new StringBuffer();
\r
731 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
\r
732 forward.append(ch);
\r
735 // Now do it backwards
\r
736 StringBuffer reverse = new StringBuffer();
\r
737 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
\r
738 reverse.insert(0, ch);
\r
741 if (!forward.toString().equals(reverse.toString())) {
\r
742 errln("FAIL: Forward/reverse mismatch for input " + hex(input)
\r
743 + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
\r
744 } else if (isVerbose()) {
\r
745 logln("Ok: Forward/reverse for input " + hex(input)
\r
746 + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
\r
750 private void backAndForth(Normalizer iter, String[][] tests)
\r
752 for (int i = 0; i < tests.length; i++)
\r
754 iter.setText(tests[i][0]);
\r
756 // Run through the iterator forwards and stick it into a
\r
758 StringBuffer forward = new StringBuffer();
\r
759 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
\r
760 forward.append(ch);
\r
763 // Now do it backwards
\r
764 StringBuffer reverse = new StringBuffer();
\r
765 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
\r
766 reverse.insert(0, ch);
\r
769 if (!forward.toString().equals(reverse.toString())) {
\r
770 errln("FAIL: Forward/reverse mismatch for input "
\r
771 + hex(tests[i][0]) + ", forward: " + hex(forward)
\r
772 + ", backward: " + hex(reverse));
\r
773 } else if (isVerbose()) {
\r
774 logln("Ok: Forward/reverse for input " + hex(tests[i][0])
\r
775 + ", forward: " + hex(forward) + ", backward: "
\r
781 private void staticTest (Normalizer.Mode mode,
\r
782 String[][] tests, int outCol) throws Exception{
\r
783 for (int i = 0; i < tests.length; i++)
\r
785 String input = Utility.unescape(tests[i][0]);
\r
786 String expect = Utility.unescape(tests[i][outCol]);
\r
788 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
790 String output = Normalizer.normalize(input, mode);
\r
792 if (!output.equals(expect)) {
\r
793 errln("FAIL: case " + i
\r
794 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
795 + " but got '" + output + "' (" + hex(output) + ")" );
\r
798 char[] output = new char[1];
\r
799 for (int i = 0; i < tests.length; i++)
\r
801 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
802 String expect =Utility.unescape( tests[i][outCol]);
\r
804 logln("Normalizing '" + new String(input) + "' (" +
\r
805 hex(new String(input)) + ")" );
\r
809 reqLength=Normalizer.normalize(input,output, mode,0);
\r
810 if(reqLength<=output.length ){
\r
813 }catch(IndexOutOfBoundsException e){
\r
814 output= new char[Integer.parseInt(e.getMessage())];
\r
818 if (!expect.equals(new String(output,0,reqLength))) {
\r
819 errln("FAIL: case " + i
\r
820 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
821 + " but got '" + new String(output)
\r
822 + "' (" + hex(new String(output)) + ")" );
\r
826 private void decomposeTest(Normalizer.Mode mode,
\r
827 String[][] tests, int outCol) throws Exception{
\r
828 for (int i = 0; i < tests.length; i++)
\r
830 String input = Utility.unescape(tests[i][0]);
\r
831 String expect = Utility.unescape(tests[i][outCol]);
\r
833 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
835 String output = Normalizer.decompose(input, mode==Normalizer.NFKD);
\r
837 if (!output.equals(expect)) {
\r
838 errln("FAIL: case " + i
\r
839 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
840 + " but got '" + output + "' (" + hex(output) + ")" );
\r
843 char[] output = new char[1];
\r
844 for (int i = 0; i < tests.length; i++)
\r
846 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
847 String expect = Utility.unescape(tests[i][outCol]);
\r
849 logln("Normalizing '" + new String(input) + "' (" +
\r
850 hex(new String(input)) + ")" );
\r
854 reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0);
\r
855 if(reqLength<=output.length ){
\r
858 }catch(IndexOutOfBoundsException e){
\r
859 output= new char[Integer.parseInt(e.getMessage())];
\r
863 if (!expect.equals(new String(output,0,reqLength))) {
\r
864 errln("FAIL: case " + i
\r
865 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
866 + " but got '" + new String(output)
\r
867 + "' (" + hex(new String(output)) + ")" );
\r
870 output = new char[1];
\r
871 for (int i = 0; i < tests.length; i++)
\r
873 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
874 String expect = Utility.unescape(tests[i][outCol]);
\r
876 logln("Normalizing '" + new String(input) + "' (" +
\r
877 hex(new String(input)) + ")" );
\r
881 reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0);
\r
882 if(reqLength<=output.length ){
\r
885 }catch(IndexOutOfBoundsException e){
\r
886 output= new char[Integer.parseInt(e.getMessage())];
\r
890 if (!expect.equals(new String(output,0,reqLength))) {
\r
891 errln("FAIL: case " + i
\r
892 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
893 + " but got '" + new String(output)
\r
894 + "' (" + hex(new String(output)) + ")" );
\r
896 char[] output2 = new char[reqLength * 2];
\r
897 System.arraycopy(output, 0, output2, 0, reqLength);
\r
898 int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
\r
899 if(retLength != reqLength){
\r
900 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
\r
905 private void composeTest(Normalizer.Mode mode,
\r
906 String[][] tests, int outCol) throws Exception{
\r
907 for (int i = 0; i < tests.length; i++)
\r
909 String input = Utility.unescape(tests[i][0]);
\r
910 String expect = Utility.unescape(tests[i][outCol]);
\r
912 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
914 String output = Normalizer.compose(input, mode==Normalizer.NFKC);
\r
916 if (!output.equals(expect)) {
\r
917 errln("FAIL: case " + i
\r
918 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
919 + " but got '" + output + "' (" + hex(output) + ")" );
\r
922 char[] output = new char[1];
\r
923 for (int i = 0; i < tests.length; i++)
\r
925 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
926 String expect = Utility.unescape(tests[i][outCol]);
\r
928 logln("Normalizing '" + new String(input) + "' (" +
\r
929 hex(new String(input)) + ")" );
\r
933 reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0);
\r
934 if(reqLength<=output.length ){
\r
937 }catch(IndexOutOfBoundsException e){
\r
938 output= new char[Integer.parseInt(e.getMessage())];
\r
942 if (!expect.equals(new String(output,0,reqLength))) {
\r
943 errln("FAIL: case " + i
\r
944 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
945 + " but got '" + new String(output)
\r
946 + "' (" + hex(new String(output)) + ")" );
\r
949 output = new char[1];
\r
950 for (int i = 0; i < tests.length; i++)
\r
952 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
953 String expect = Utility.unescape(tests[i][outCol]);
\r
955 logln("Normalizing '" + new String(input) + "' (" +
\r
956 hex(new String(input)) + ")" );
\r
960 reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0);
\r
961 if(reqLength<=output.length ){
\r
964 }catch(IndexOutOfBoundsException e){
\r
965 output= new char[Integer.parseInt(e.getMessage())];
\r
969 if (!expect.equals(new String(output,0,reqLength))) {
\r
970 errln("FAIL: case " + i
\r
971 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
972 + " but got '" + new String(output)
\r
973 + "' (" + hex(new String(output)) + ")" );
\r
976 char[] output2 = new char[reqLength * 2];
\r
977 System.arraycopy(output, 0, output2, 0, reqLength);
\r
978 int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
\r
979 if(retLength != reqLength){
\r
980 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
\r
984 private void iterateTest(Normalizer iter, String[][] tests, int outCol){
\r
985 for (int i = 0; i < tests.length; i++)
\r
987 String input = Utility.unescape(tests[i][0]);
\r
988 String expect = Utility.unescape(tests[i][outCol]);
\r
990 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
992 iter.setText(input);
\r
993 assertEqual(expect, iter, "case " + i + " ");
\r
// Compares the output of a Normalizer with an expected string in both directions.
// Forward pass: each code point from iter.next() is checked against
// UTF16.charAt(expected, index), and index advances by UTF16.getCharCount(ch).
// Backward pass: each code point from iter.previous() is checked against
// cIter.previousCodePoint(), with cIter positioned at the end of expected.
// Every problem is reported through errln() with msg as a prefix.
// NOTE(review): the messages concatenate `want` (an int) directly, so the
// expected value prints as a decimal number, and `(char)ch` drops the high
// bits of code points above U+FFFF; only the hex(...) part is reliable.
997 private void assertEqual(String expected, Normalizer iter, String msg)
\r
1001 UCharacterIterator cIter = UCharacterIterator.getInstance(expected);
\r
// forward iteration
1003 while ((ch=iter.next())!= Normalizer.DONE){
\r
// the iterator produced more code points than expected contains
1004 if (index >= expected.length()) {
\r
1005 errln("FAIL: " + msg + "Unexpected character '" + (char)ch
\r
1006 + "' (" + hex(ch) + ")"
\r
1007 + " at index " + index);
\r
1010 int want = UTF16.charAt(expected,index);
\r
1012 errln("FAIL: " + msg + "got '" + (char)ch
\r
1013 + "' (" + hex(ch) + ")"
\r
1014 + " but expected '" + want + "' (" + hex(want)+ ")"
\r
1015 + " at index " + index);
\r
1017 index+= UTF16.getCharCount(ch);
\r
// the iterator stopped before covering all of expected
1019 if (index < expected.length()) {
\r
1020 errln("FAIL: " + msg + "Only got " + index + " chars, expected "
\r
1021 + expected.length());
\r
// backward iteration, walking expected from its end
1024 cIter.setToLimit();
\r
1025 while((ch=iter.previous())!=Normalizer.DONE){
\r
1026 int want = cIter.previousCodePoint();
\r
1027 if (ch != want ) {
\r
1028 errln("FAIL: " + msg + "got '" + (char)ch
\r
1029 + "' (" + hex(ch) + ")"
\r
1030 + " but expected '" + want + "' (" + hex(want) + ")"
\r
1031 + " at index " + index);
\r
1035 //--------------------------------------------------------------------------
\r
1037 // NOTE: These tests are used for quick debugging so are not ported
\r
1038 // to ICU4C tsnorm.cpp in intltest
\r
// Debugging check of the static Normalizer API.
// Part 1: Normalizer.isNormalized() must accept U+1D157 U+1D165 under NFC.
// Part 2: Normalizer.normalize() is run on a long mixed string and the result
// is compared with the spelled-out `expect`.
1041 public void TestDebugStatic(){
\r
1042 String in = Utility.unescape("\\U0001D157\\U0001D165");
\r
1043 if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){
\r
1044 errln("isNormalized failed");
\r
// The \\U escapes below stay literal in the Java string and are resolved by
// Utility.unescape() when input is passed to normalize(); the \u escapes are
// resolved by the compiler.
1047 String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1048 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1049 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1050 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1051 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1052 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1053 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1054 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1055 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1056 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1057 "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1058 "d\u031B\u0307\u0323";
\r
// expect spells the Hangul syllables as conjoining jamo and the musical
// symbols as surrogate pairs; presumably the NFD form of input - confirm
// against the mode argument of the normalize() call.
1059 String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
\r
1060 "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+
\r
1061 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1062 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1063 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1064 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1065 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1066 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1067 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1068 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1069 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1070 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1071 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1072 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1073 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1074 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1075 "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+
\r
1076 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1077 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1078 "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+
\r
1079 "cccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1080 "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1081 "dddddddddddddddddddddddd"+
\r
1082 "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
\r
1083 "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307";
\r
1084 String output = Normalizer.normalize(Utility.unescape(input),
\r
1086 if(!expect.equals(output)){
\r
1087 errln("FAIL expected: "+hex(expect) + " got: "+hex(output));
\r
// Debugging check of Normalizer iteration in mode NONE over a
// StringCharacterIterator: the iterator is walked forward with next() and then
// backward with previous(), and each code point is compared with `expected`,
// which is built from the same escaped text as `src`.
1093 public void TestDebugIter(){
\r
1094 String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
\r
1095 String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
\r
// NOTE(review): src was already unescaped above and is passed through
// Utility.unescape() a second time here; presumably a no-op because no
// backslashes remain - confirm.
1096 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)),
\r
1097 Normalizer.NONE,0);
\r
1100 UCharacterIterator cIter = UCharacterIterator.getInstance(expected);
\r
// forward iteration
1102 while ((ch=iter.next())!= Normalizer.DONE){
\r
1103 if (index >= expected.length()) {
\r
1104 errln("FAIL: " + "Unexpected character '" + (char)ch
\r
1105 + "' (" + hex(ch) + ")"
\r
1106 + " at index " + index);
\r
1109 int want = UTF16.charAt(expected,index);
\r
1111 errln("FAIL: " + "got '" + (char)ch
\r
1112 + "' (" + hex(ch) + ")"
\r
1113 + " but expected '" + want + "' (" + hex(want)+ ")"
\r
1114 + " at index " + index);
\r
1116 index+= UTF16.getCharCount(ch);
\r
1118 if (index < expected.length()) {
\r
1119 errln("FAIL: " + "Only got " + index + " chars, expected "
\r
1120 + expected.length());
\r
// backward iteration, walking expected from its end
1123 cIter.setToLimit();
\r
1124 while((ch=iter.previous())!=Normalizer.DONE){
\r
1125 int want = cIter.previousCodePoint();
\r
1126 if (ch != want ) {
\r
1127 errln("FAIL: " + "got '" + (char)ch
\r
1128 + "' (" + hex(ch) + ")"
\r
1129 + " but expected '" + want + "' (" + hex(want) + ")"
\r
1130 + " at index " + index);
\r
// Debugging check of the first()/next() and previous() iteration style on a
// Normalizer in NFKC mode over U+1D15E. The code points returned are collected
// into `got` and compared with `expected` (forward) and `expectedReverse`
// (backward).
1134 public void TestDebugIterOld(){
\r
1135 String input = "\\U0001D15E";
\r
1136 String expected = "\uD834\uDD57\uD834\uDD65";
\r
1137 String expectedReverse = "\uD834\uDD65\uD834\uDD57";
\r
1140 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)),
\r
1141 Normalizer.NFKC,0);
\r
1142 StringBuffer got = new StringBuffer();
\r
// forward iteration
1143 for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next())
\r
1145 if (index >= expected.length()) {
\r
1146 errln("FAIL: " + "Unexpected character '" + (char)ch +
\r
1147 "' (" + hex(ch) + ")" + " at index " + index);
\r
1150 got.append(UCharacter.toString(ch));
\r
1153 if (!expected.equals(got.toString())) {
\r
1154 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")"
\r
1155 + " but expected '" + expected + "' ("
\r
1156 + hex(expected) + ")");
\r
1158 if (got.length() < expected.length()) {
\r
1159 errln("FAIL: " + "Only got " + index + " chars, expected "
\r
1160 + expected.length());
\r
// backward iteration, starting from the end of the text
1163 logln("Reverse Iteration\n");
\r
1164 iter.setIndexOnly(iter.endIndex());
\r
1166 for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){
\r
1167 if (index >= expected.length()) {
\r
1168 errln("FAIL: " + "Unexpected character '" + (char)ch
\r
1169 + "' (" + hex(ch) + ")" + " at index " + index);
\r
1172 got.append(UCharacter.toString(ch));
\r
// NOTE(review): the comparison uses expectedReverse, but the failure message
// below prints `expected`, so a reported mismatch shows the wrong reference.
1174 if (!expectedReverse.equals(got.toString())) {
\r
1175 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")"
\r
1176 + " but expected '" + expected
\r
1177 + "' (" + hex(expected) + ")");
\r
1179 if (got.length() < expected.length()) {
\r
1180 errln("FAIL: " + "Only got " + index + " chars, expected "
\r
1181 + expected.length());
\r
1185 //--------------------------------------------------------------------------
\r
1186 // helper class for TestPreviousNext()
\r
1187 // simple UTF-32 character iterator
\r
// Reference iterator over an int[] of code points. TestPreviousNext and
// TestPreviousNextJCI construct it over their `expect` arrays and step it in
// lock-step with the Normalizer under test.
1188 class UCharIterator {
\r
// src: code point array, len: number of entries, index: starting position
// (argument roles as used by the callers: expect, expect.length, then a start index)
1190 public UCharIterator(int[] src, int len, int index){
\r
1197 public int current() {
\r
1205 public int next() {
\r
1213 public int previous() {
\r
// the value returned here is used by the tests as an index into expectIndex[]
1221 public int getIndex() {
\r
1226 private int length, i;
\r
// Moves an NFD Normalizer (built from a String) and a UCharIterator over the
// expected code points through the same scripted sequence of previous() /
// current() / next() calls. After each move the returned code points and the
// Normalizer's source index (against expectIndex[]) are compared.
1228 public void TestPreviousNext() {
\r
1229 // src and expect strings
\r
1231 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
\r
1232 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
\r
1240 0x4f, 0x302, 0x301
\r
1243 // expected src indexes corresponding to expect indexes
\r
1244 int expectIndex[]={
\r
1249 6 // behind last character
\r
1252 // initial indexes into the src and expect strings
\r
1254 final int SRC_MIDDLE=4;
\r
1255 final int EXPECT_MIDDLE=3;
\r
1258 // movement vector
\r
1259 // - for previous(), 0 for current(), + for next()
\r
1260 // (a prefix of this string is copied into the error messages below)
\r
1261 String moves="0+0+0--0-0-+++0--+++++++0--------";
\r
1264 Normalizer iter = new Normalizer(new String(src),
\r
1265 Normalizer.NFD,0);
\r
1266 UCharIterator iter32 = new UCharIterator(expect, expect.length,
\r
1272 // initially set the indexes into the middle of the strings
\r
1273 iter.setIndexOnly(SRC_MIDDLE);
\r
1275 // move around and compare the iteration code points with
\r
1276 // the expected ones
\r
1277 int movesIndex =0;
\r
1278 while(movesIndex<moves.length()) {
\r
1279 m=moves.charAt(movesIndex++);
\r
1281 c1=iter.previous();
\r
1282 c2=iter32.previous();
\r
1283 } else if(m=='0') {
\r
1284 c1=iter.current();
\r
1285 c2=iter32.current();
\r
1286 } else /* m=='+' */ {
\r
1291 // compare results
\r
1293 // report the moves made so far, up to and including the current one
\r
1294 String history = moves.substring(0,movesIndex);
\r
1295 errln("error: mismatch in Normalizer iteration at "+history+": "
\r
1296 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
\r
1300 // compare indexes
\r
1301 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
\r
1302 // report the moves made so far, up to and including the current one
\r
1303 String history = moves.substring(0,movesIndex);
\r
1304 errln("error: index mismatch in Normalizer iteration at "
\r
1305 +history+ " : "+ "Normalizer index " +iter.getIndex()
\r
1306 +" expected "+ expectIndex[iter32.getIndex()]);
\r
// Same scripted previous()/current()/next() walk as TestPreviousNext, except
// that the NFD Normalizer is constructed over a java.text
// StringCharacterIterator instead of a String. After each move the returned
// code points and the Normalizer's source index (against expectIndex[]) are
// compared with the UCharIterator reference.
1312 public void TestPreviousNextJCI() {
\r
1313 // src and expect strings
\r
1315 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
\r
1316 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
\r
1324 0x4f, 0x302, 0x301
\r
1327 // expected src indexes corresponding to expect indexes
\r
1328 int expectIndex[]={
\r
1333 6 // behind last character
\r
1336 // initial indexes into the src and expect strings
\r
1338 final int SRC_MIDDLE=4;
\r
1339 final int EXPECT_MIDDLE=3;
\r
1342 // movement vector
\r
1343 // - for previous(), 0 for current(), + for next()
\r
1344 // (a prefix of this string is copied into the error messages below)
\r
1345 String moves="0+0+0--0-0-+++0--+++++++0--------";
\r
1348 StringCharacterIterator text = new StringCharacterIterator(new String(src));
\r
1349 Normalizer iter = new Normalizer(text,Normalizer.NFD,0);
\r
1350 UCharIterator iter32 = new UCharIterator(expect, expect.length,
\r
1356 // initially set the indexes into the middle of the strings
\r
1357 iter.setIndexOnly(SRC_MIDDLE);
\r
1359 // move around and compare the iteration code points with
\r
1360 // the expected ones
\r
1361 int movesIndex =0;
\r
1362 while(movesIndex<moves.length()) {
\r
1363 m=moves.charAt(movesIndex++);
\r
1365 c1=iter.previous();
\r
1366 c2=iter32.previous();
\r
1367 } else if(m=='0') {
\r
1368 c1=iter.current();
\r
1369 c2=iter32.current();
\r
1370 } else /* m=='+' */ {
\r
1375 // compare results
\r
1377 // report the moves made so far, up to and including the current one
\r
1378 String history = moves.substring(0,movesIndex);
\r
1379 errln("error: mismatch in Normalizer iteration at "+history+": "
\r
1380 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
\r
1384 // compare indexes
\r
1385 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
\r
1386 // report the moves made so far, up to and including the current one
\r
1387 String history = moves.substring(0,movesIndex);
\r
1388 errln("error: index mismatch in Normalizer iteration at "
\r
1389 +history+ " : "+ "Normalizer index " +iter.getIndex()
\r
1390 +" expected "+ expectIndex[iter32.getIndex()]);
\r
1396 // test APIs that are not otherwise used - improve test coverage
\r
// Exercises Normalizer entry points that the other tests do not reach:
// construction from a UCharacterIterator, clone()/hashCode(), the static
// compose()/decompose(), setText()/getText() in several overloads,
// setMode()/getMode(), last(), mode NONE, and normalize(int, Mode[, int]).
1397 public void TestNormalizerAPI() throws Exception {
\r
1399 // instantiate a Normalizer from a CharacterIterator
\r
1400 String s=Utility.unescape("a\u0308\uac00\\U0002f800");
\r
1401 // make s a bit longer and more interesting
\r
1402 UCharacterIterator iter = UCharacterIterator.getInstance(s+s);
\r
1403 Normalizer norm = new Normalizer(iter, Normalizer.NFC,0);
\r
1404 if(norm.next()!=0xe4) {
\r
1405 errln("error in Normalizer(CharacterIterator).next()");
\r
1408 // test clone(), ==, and hashCode()
\r
1409 Normalizer clone=(Normalizer)norm.clone();
\r
// NOTE(review): this reports an error when the clone EQUALS the original,
// while the message text reads "clone()!=norm". Presumably the check relies on
// Normalizer not overriding equals() - confirm the intended polarity.
1410 if(clone.equals(norm)) {
\r
1411 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm");
\r
// NOTE(review): the check compares getLength(), but the message names getBeginIndex().
1415 if(clone.getLength()!= norm.getLength()){
\r
1416 errln("error in Normalizer.getBeginIndex()");
\r
1418 // clone must have the same hashCode()
\r
1419 //if(clone.hashCode()!=norm.hashCode()) {
\r
1420 // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()");
\r
1422 if(clone.next()!=0xac00) {
\r
1423 errln("error in Normalizer(Normalizer(CharacterIterator)).next()");
\r
1425 int ch = clone.next();
\r
1427 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()");
\r
1429 // position changed, must change hashCode()
\r
1430 if(clone.hashCode()==norm.hashCode()) {
\r
1431 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()");
\r
1434 // test compose() and decompose()
\r
1436 String nfkc, nfkd;
\r
1437 tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121");
\r
1438 tel.insert(1,(char)0x0301);
\r
// the second argument `true` selects the compatibility forms (results are named nfkc / nfkd)
1440 nfkc=Normalizer.compose(tel.toString(), true);
\r
1441 nfkd=Normalizer.decompose(tel.toString(), true);
\r
1443 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))||
\r
1444 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL"))
\r
1446 errln("error in Normalizer::(de)compose(): wrong result(s)");
\r
1449 // test setIndex()
\r
1450 // ch=norm.setIndex(3);
\r
1451 // if(ch!=0x4e3d) {
\r
1452 // errln("error in Normalizer(CharacterIterator).setIndex(3)");
\r
1455 // test setText(CharacterIterator) and getText()
\r
1457 clone.setText(iter);
\r
1459 out = clone.getText();
\r
1460 out2 = iter.getText();
\r
1461 if( !out.equals(out2) ||
\r
1462 clone.startIndex()!=0||
\r
1463 clone.endIndex()!=iter.getLength()
\r
1465 errln("error in Normalizer::setText() or Normalizer::getText()");
\r
// same comparison through the char[] fill-in overloads of getText()
1468 char[] fillIn1 = new char[clone.getLength()];
\r
1469 char[] fillIn2 = new char[iter.getLength()];
\r
1470 int len = clone.getText(fillIn1);
\r
1471 iter.getText(fillIn2,0);
\r
1472 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
\r
1473 errln("error in Normalizer.getText(). Normalizer: "+
\r
1474 Utility.hex(new String(fillIn1))+
\r
1475 " Iter: " + Utility.hex(new String(fillIn2)));
\r
// round trip: set the text from a char[] and read it back
1478 clone.setText(fillIn1);
\r
1479 len = clone.getText(fillIn2);
\r
1480 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
\r
1481 errln("error in Normalizer.setText() or Normalizer.getText()"+
\r
1482 Utility.hex(new String(fillIn1))+
\r
1483 " Iter: " + Utility.hex(new String(fillIn2)));
\r
1486 // test setText(UChar *), getUMode() and setMode()
\r
1488 clone.setIndexOnly(1);
\r
1489 clone.setMode(Normalizer.NFD);
\r
1490 if(clone.getMode()!=Normalizer.NFD) {
\r
1491 errln("error in Normalizer::setMode() or Normalizer::getMode()");
\r
1493 if(clone.next()!=0x308 || clone.next()!=0x1100) {
\r
1494 errln("error in Normalizer::setText() or Normalizer::setMode()");
\r
1497 // test last()/previous() with an internal buffer overflow
\r
1498 StringBuffer buf = new StringBuffer("aaaaaaaaaa");
\r
1499 buf.setCharAt(10-1,'\u0308');
\r
1500 clone.setText(buf);
\r
1501 if(clone.last()!=0x308) {
\r
1502 errln("error in Normalizer(10*U+0308).last()");
\r
1505 // test UNORM_NONE
\r
1506 norm.setMode(Normalizer.NONE);
\r
1507 if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) {
\r
1508 errln("error in Normalizer(UNORM_NONE).first()/next()/last()");
\r
1510 out=Normalizer.normalize(s, Normalizer.NONE);
\r
1511 if(!out.equals(s)) {
\r
1512 errln("error in Normalizer::normalize(UNORM_NONE)");
\r
// single code point overloads of normalize(), compared against the unescaped exp
1515 String exp = "\\U0001D157\\U0001D165";
\r
1516 String ns = Normalizer.normalize(ch,Normalizer.NFC);
\r
1517 if(!ns.equals(Utility.unescape(exp))){
\r
1518 errln("error in Normalizer.normalize(int,Mode)");
\r
1520 ns = Normalizer.normalize(ch,Normalizer.NFC,0);
\r
1521 if(!ns.equals(Utility.unescape(exp))){
\r
1522 errln("error in Normalizer.normalize(int,Mode,int)");
\r
1526 }catch(Exception e){
\r
// Table-driven test of Normalizer.concatenate(): each row of cases[] holds a
// mode, a left string, a right string and the expected result. Both the
// String and the char[] overloads are called for every row.
// NOTE(review): both checks below call errln() when result.equals(expect) is
// TRUE, i.e. when the concatenation produced the expected string. The message
// text ("expected: ... got: ...") suggests the condition is inverted and
// should be !result.equals(expect) - confirm and fix.
1531 public void TestConcatenate() {
\r
1533 Object[][]cases=new Object[][]{
\r
1534 /* mode, left, right, result */
\r
1538 "\u0301sum\u00e9",
\r
1539 "r\u00e9sum\u00e9"
\r
1544 "\u1161bcdefghijk",
\r
1545 "a\uac00bcdefghijk"
\r
1547 /* ### TODO: add more interesting cases */
\r
1550 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +
\r
1551 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +
\r
1552 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +
\r
1553 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +
\r
1554 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +
\r
1555 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +
\r
1556 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB",
\r
1558 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +
\r
1559 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +
\r
1560 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +
\r
1561 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +
\r
1562 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +
\r
1563 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +
\r
1564 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E",
\r
// expected result for the long row: identical to left+right except around the
// junction, where U+1FDB and U+1FE3 appear as U+0399 U+0301 and U+03C5 U+0308 U+0301
1566 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +
\r
1567 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +
\r
1568 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +
\r
1569 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +
\r
1570 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +
\r
1571 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +
\r
1572 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u0399" +
\r
1573 "\u0301\u03C5\u0308\u0301\u1FEB\u1FEE\u1FEF\u1FF9" +
\r
1574 "\u1FFB\u1FFD\u2000\u2001\u2126\u212A\u212B\u2329" +
\r
1575 "\u232A\uF900\uFA10\uFA12\uFA15\uFA20\uFA22\uFA25" +
\r
1576 "\uFA26\uFA2A\uFB1F\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E" +
\r
1577 "\uFB2F\uFB30\uFB31\uFB32\uFB33\uFB34\uFB35\uFB36" +
\r
1578 "\uFB38\uFB39\uFB3A\uFB3B\uFB3C\uFB3E\uFB40\uFB41" +
\r
1579 "\uFB43\uFB44\uFB46\uFB47\uFB48\uFB49\uFB4A\uFB4B" +
\r
1580 "\uFB4C\uFB4D\uFB4E"
\r
1584 String left, right, expect, result;
\r
1585 Normalizer.Mode mode;
\r
1588 /* test concatenation */
\r
1589 for(i=0; i<cases.length; ++i) {
\r
1590 mode = (Normalizer.Mode)cases[i][0];
\r
1592 left=(String)cases[i][1];
\r
1593 right=(String)cases[i][2];
\r
1594 expect=(String)cases[i][3];
\r
// String overload
1596 result=Normalizer.concatenate(left, right, mode,0);
\r
1597 if( result.equals(expect)) {
\r
1598 errln("error in Normalizer.concatenate(), cases[] failed"
\r
1599 +", result==expect: expected: "
\r
1600 + hex(expect)+" =========> got: " + hex(result));
\r
// char[] overload
1604 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0);
\r
1605 if( result.equals(expect)) {
\r
1606 errln("error in Normalizer.concatenate(), cases[] failed"
\r
1607 +", result==expect: expected: "
\r
1608 + hex(expect)+" =========> got: " + hex(result));
\r
// Exclusive bound passed to rand.nextInt() in TestCheckFCD, where the draw is
// scaled by 50/RAND_MAX to pick an index into datachar.
1613 private final int RAND_MAX = 0x7fff;
\r
// Tests Normalizer.quickCheck() in FCD mode three ways:
//  1. fixed arrays with a known answer (FAST, FALSE, TRUE),
//  2. the four datastr rows against the result[] table,
//  3. 50 rounds of random strings drawn from datachar, where the expected
//     answer is derived by comparing per-character NFD output (norm) with the
//     NFD of the whole string (nfd).
1615 public void TestCheckFCD()
\r
1617 char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
\r
1618 0x0008, 0x0009, 0x000A};
\r
1620 char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301,
\r
1621 0x02B9, 0x0314, 0x0315, 0x0316};
\r
1623 char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7,
\r
1624 0x0050, 0x0730, 0x09EE, 0x1E10};
\r
// each row ends in a 0 element, which is included when quickCheck() is called
// with datastr[count].length below
1626 char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0},
\r
1627 {0x0061, 0x030A, 0x00E2, 0x0323, 0},
\r
1628 {0x0061, 0x0323, 0x00E2, 0x0323, 0},
\r
1629 {0x0061, 0x0323, 0x1E05, 0x0302, 0}
\r
// expected quickCheck() outcome for the datastr row at the same index
1631 Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES};
\r
// character pool for the random strings
1633 char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
\r
1635 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
\r
1637 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306,
\r
1638 0x0307, 0x0308, 0x0309, 0x030a,
\r
1639 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326,
\r
1640 0x0327, 0x0328, 0x0329, 0x032a,
\r
1641 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06,
\r
1642 0x1e07, 0x1e08, 0x1e09, 0x1e0a
\r
1647 if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES)
\r
1648 errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n");
\r
1649 if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO)
\r
1650 errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n");
\r
1651 if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES)
\r
1652 errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n");
\r
1657 Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0);
\r
1658 if (result[count] != fcdresult) {
\r
1659 errln("Normalizer.quickCheck(FCD) failed: Data set "+ count
\r
1660 + " expected value "+ result[count]);
\r
1665 /* random checks of long strings */
\r
1666 //srand((unsigned)time( NULL ));
\r
1667 Random rand = createRandom(); // use test framework's random
\r
1669 for (count = 0; count < 50; count ++)
\r
1672 Normalizer.QuickCheckResult testresult = Normalizer.YES;
\r
1673 char[] data= new char[20];
\r
1674 char[] norm= new char[100];
\r
1675 char[] nfd = new char[100];
\r
1676 int normStart = 0;
\r
// fill positions up to (not including) 19; the remaining element of data keeps
// its default value 0
1678 while (size != 19) {
\r
// nextInt(RAND_MAX) is below RAND_MAX, so the scaled index is always below 50
// NOTE(review): datachar appears to hold more than 50 entries, so the last
// ones would never be drawn - confirm whether that is intended.
1679 data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX];
\r
1680 logln("0x"+data[size]);
\r
// normalize this single character and append its NFD form to norm
1681 normStart += Normalizer.normalize(data,size,size+1,
\r
1682 norm,normStart,100,
\r
1683 Normalizer.NFD,0);
\r
// NFD of the whole random string, for comparison with the concatenated pieces
1688 nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0);
\r
1689 // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL,
\r
1690 // nfd, 100, &status);
\r
// if the per-character NFD differs from the whole-string NFD, the expected
// quickCheck() answer becomes NO
1691 if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) {
\r
1692 testresult = Normalizer.NO;
\r
1694 if (testresult == Normalizer.YES) {
\r
1695 logln("result Normalizer.YES\n");
\r
1698 logln("result Normalizer.NO\n");
\r
1701 if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) {
\r
1702 errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) );
\r
1708 // reference implementation of Normalizer::compare
\r
// Reference result for Normalizer.compare(), assembled from separately tested
// pieces: decompose, optionally case-fold and decompose again, then compare.
//   s1, s2  - the strings to compare
//   options - Normalizer.compare() option bits; the bits above
//             COMPARE_NORM_OPTIONS_SHIFT are passed to decompose() as its options
// Returns the raw comparison value; callers only look at its sign.
1709 private int ref_norm_compare(String s1, String s2, int options) {
\r
1710 String t1, t2,r1,r2;
\r
1712 int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
\r
1714 if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) {
\r
1715 // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
\r
1716 r1 = Normalizer.decompose(s1,false,normOptions);
\r
1717 r2 = Normalizer.decompose(s2,false,normOptions);
\r
// NOTE(review): the complete `options` word, including the compare and
// normalization bits, is handed to foldCase() - confirm that foldCase() ignores
// the bits it does not define.
1718 r1 = UCharacter.foldCase(r1,options);
\r
1719 r2 = UCharacter.foldCase(r2,options);
\r
1725 t1 = Normalizer.decompose(r1, false, normOptions);
\r
1726 t2 = Normalizer.decompose(r2, false, normOptions);
\r
1728 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
\r
// comparator configured with its first two flags (true, false) - presumably
// code point order on, ignore-case off; confirm against the constructor docs
1729 UTF16.StringComparator comp
\r
1730 = new UTF16.StringComparator(true, false,
\r
1731 UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1732 return comp.compare(t1,t2);
\r
// otherwise plain String.compareTo(), i.e. UTF-16 code unit order
1734 return t1.compareTo(t2);
\r
1739 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
\r
1740 private int norm_compare(String s1, String s2, int options) {
\r
1741 int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
\r
1743 if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) &&
\r
1744 Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) {
\r
1745 options|=Normalizer.INPUT_IS_FCD;
\r
1748 return Normalizer.compare(s1, s2, options);
\r
1751 // reference implementation of UnicodeString::caseCompare
\r
// Reference result for a caseless string comparison: both strings are
// case-folded and then compared.
//   s1, s2  - the strings to compare
//   options - option bits; FOLD_CASE_EXCLUDE_SPECIAL_I and
//             COMPARE_CODE_POINT_ORDER are the ones read here
// Returns the raw comparison value; callers only look at its sign.
1752 private int ref_case_compare(String s1, String s2, int options) {
\r
// the boolean passed to foldCase() is true when FOLD_CASE_EXCLUDE_SPECIAL_I is NOT set
1758 t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
\r
1759 t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
\r
1761 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
\r
1762 UTF16.StringComparator comp
\r
1763 = new UTF16.StringComparator(true, false,
\r
1764 UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1765 return comp.compare(t1,t2);
\r
// otherwise plain String.compareTo(), i.e. UTF-16 code unit order
1767 return t1.compareTo(t2);
\r
1772 // reduce an integer to -1/0/1
\r
1773 private static int sign(int value) {
\r
1777 return (value>>31)|1;
\r
// Returns a short text for the sign of value, with one branch each for
// negative, zero and positive input.
1780 private static String signString(int value) {
\r
1783 } else if(value==0) {
\r
1785 } else /* value>0 */ {
\r
1789 // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
\r
1790 // by comparing it with its semantic equivalent
\r
1791 // since we trust the pieces, this is sufficient
\r
1793 // test each string with itself and each other
\r
1794 // each time with all options
\r
// Input table for TestCompare and TestCompareDebug. Each entry is passed
// through Utility.unescape() before use, so \\U escapes are resolved at run
// time, and is compared against the other entries under every option set in opt[].
1795 private String strings[]=new String[]{
\r
1796 // some cases from NormalizationTest.txt
\r
1798 "D\u031B\u0307\u0323",
\r
1799 "\u1E0C\u031B\u0307",
\r
1800 "D\u031B\u0323\u0307",
\r
1801 "d\u031B\u0323\u0307",
\r
1808 // Angstrom sign = A ring
\r
1816 "a\u059A\u0316\u302A\u032Fb",
\r
1817 "a\u302A\u0316\u032F\u059Ab",
\r
1818 "a\u302A\u0316\u032F\u059Ab",
\r
1819 "A\u059A\u0316\u302A\u032Fb",
\r
1821 // from ICU case folding tests
\r
1823 "A\u00df\u00b5\ufb03\\U0001040c\u0131",
\r
1824 "ass\u03bcffi\\U00010434i",
\r
1825 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff",
\r
1826 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff",
\r
1827 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff",
\r
1828 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd",
\r
1830 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold
\r
1831 // vs. U+10000 at bottom - code point order
\r
1833 "\ud800\ud800\udc01",
\r
1836 // other code point order tests from ustrtest.cpp
\r
1839 "\u20ac\ud800\udc00",
\r
1844 "\uff61\ud800\udc02",
\r
1848 // long strings, see cnormtst.c/TestNormCoverage()
\r
1849 // equivalent if case-insensitive
\r
1851 "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1852 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1853 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1854 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1855 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1856 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1857 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1858 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1859 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1860 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1861 "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1862 "d\u031B\u0307\u0323",
\r
1864 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
\r
1865 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1866 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1867 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1868 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1869 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1870 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1871 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1872 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1873 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1874 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
\r
1875 "\u1E0C\u031B\u0307",
\r
1877 // some strings that may make a difference whether the compare function
\r
1878 // case-folds or decomposes first
\r
1880 "\u0360\u0345\u0334",
\r
1881 "\u0360\u03b9\u0334",
\r
1883 "\u0360\u1f80\u0334",
\r
1884 "\u0360\u03b1\u0313\u03b9\u0334",
\r
1886 "\u0360\u1ffc\u0334",
\r
1887 "\u0360\u03c9\u03b9\u0334",
\r
1889 "a\u0360\u0345\u0360\u0345b",
\r
1890 "a\u0345\u0360\u0345\u0360b",
\r
1892 // interesting cases for canonical caseless match with turkic i handling
\r
1897 // strings with post-Unicode 3.2 normalization or normalization corrections
\r
1899 "\u00e4\u193b\\U0002f868",
\r
1900 "\u0061\u193b\u0308\u36fc",
\r
1905 // all combinations of options
\r
1906 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
\r
// Pairs one set of compare option bits with a display name. The compare tests
// read the two values as opt[k].options and opt[k].name.
1907 final class Temp {
\r
// opt: the option bits, str: the name shown in failure messages
1910 public Temp(int opt,String str){
\r
1916 // set UNORM_UNICODE_3_2 in one additional combination
\r
// Option sets that TestCompare and TestCompareDebug iterate over for every
// pair of strings; the name is only used in failure messages.
1918 private Temp[] opt = new Temp[]{
\r
1919 new Temp(0,"default"),
\r
1920 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ),
\r
1921 new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ),
\r
1922 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ),
\r
1923 new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"),
\r
1924 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"),
\r
// normalization option, shifted into the upper bits that the compare helpers
// extract again with >> COMPARE_NORM_OPTIONS_SHIFT
1925 new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2")
\r
// Debugging variant of TestCompare: compares norm_compare() with
// ref_norm_compare(), and the UTF16.StringComparator with ref_case_compare(),
// looking only at the sign of each result. It ends with a direct
// Normalizer.compare() check on two strings that differ only in the case of
// their first letter.
1929 public void TestCompareDebug(){
\r
1931 String[] s = new String[100]; // at least as many items as in strings[] !
\r
1934 int i, j, k, count=strings.length;
\r
1935 int result, refResult;
\r
1937 // unescape the test strings
\r
1938 for(i=0; i<count; ++i) {
\r
1939 s[i]=Utility.unescape(strings[i]);
\r
1941 UTF16.StringComparator comp = new UTF16.StringComparator(true, false,
\r
1942 UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1943 // test them each with each other
\r
1948 // test Normalizer::compare
\r
1949 result=norm_compare(s[i], s[j], opt[k].options);
\r
1950 refResult=ref_norm_compare(s[i], s[j], opt[k].options);
\r
1951 if(sign(result)!=sign(refResult)) {
\r
1952 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
1955 // test UnicodeString::caseCompare - same internal implementation function
\r
1956 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
\r
1957 // result=s[i]. (s[j], opt[k].options);
\r
// pick the case folding flavour that matches the option set
1958 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
\r
1960 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1963 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
// NOTE(review): TestCompare also calls comp.setCodePointCompare() before this
// comparison; no such call is visible here - confirm whether that is intended.
1966 result=comp.compare(s[i],s[j]);
\r
1967 refResult=ref_case_compare(s[i], s[j], opt[k].options);
\r
1968 if(sign(result)!=sign(refResult)) {
\r
1969 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
// U+00DA vs U+00FA (upper/lower case U with acute) followed by the same tail
1972 String value1 = "\u00dater\u00fd";
\r
1973 String value2 = "\u00fater\u00fd";
\r
1974 if(Normalizer.compare(value1,value2,0)!=0){
\r
1975 if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){
\r
1981 public void TestCompare() {
\r
1983 String[] s = new String[100]; // at least as many items as in strings[] !
\r
1985 int i, j, k, count=strings.length;
\r
1986 int result, refResult;
\r
1988 // create the UnicodeStrings
\r
1989 for(i=0; i<count; ++i) {
\r
1990 s[i]=Utility.unescape(strings[i]);
\r
1992 UTF16.StringComparator comp = new UTF16.StringComparator();
\r
1993 // test them each with each other
\r
1994 for(i=0; i<count; ++i) {
\r
1995 for(j=i; j<count; ++j) {
\r
1996 for(k=0; k<opt.length; ++k) {
\r
1997 // test Normalizer::compare
\r
1998 result=norm_compare(s[i], s[j], opt[k].options);
\r
1999 refResult=ref_norm_compare(s[i], s[j], opt[k].options);
\r
2000 if(sign(result)!=sign(refResult)) {
\r
2001 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
2004 // test UnicodeString::caseCompare - same internal implementation function
\r
2005 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
\r
2006 // result=s[i]. (s[j], opt[k].options);
\r
2007 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
\r
2009 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
2012 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
2015 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
\r
2016 // result=comp.caseCompare(s[i],s[j], opt[k].options);
\r
2017 result=comp.compare(s[i],s[j]);
\r
2018 refResult=ref_case_compare(s[i], s[j], opt[k].options);
\r
2019 if(sign(result)!=sign(refResult)) {
\r
2020 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
2027 // test cases with i and I to make sure Turkic works
\r
2028 char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
\r
2029 USerializedSet sset=new USerializedSet();
\r
2030 UnicodeSet set = new UnicodeSet();
\r
2035 // collect all sets into one for contiguous output
\r
2036 int[] startEnd = new int[2];
\r
2037 for(i=0; i<iI.length; ++i) {
\r
2038 if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {
\r
2039 count=sset.countRanges();
\r
2040 for(j=0; j<count; ++j) {
\r
2041 sset.getRange(j, startEnd);
\r
2042 set.add(startEnd[0], startEnd[1]);
\r
2047 // test all of these precomposed characters
\r
2048 UnicodeSetIterator it = new UnicodeSetIterator(set);
\r
2049 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
\r
2050 start=it.codepoint;
\r
2051 end=it.codepointEnd;
\r
2052 while(start<=end) {
\r
2053 s1 = Integer.toString(start);
\r
2054 s2 = Normalizer.decompose(s1, false, 0);
\r
2055 // if(U_FAILURE(errorCode)) {
\r
2056 // errln("Normalizer::decompose(U+%04x) failed: %s", start, u_errorName(errorCode));
\r
2059 for(k=0; k<opt.length; ++k) {
\r
2060 // test Normalizer::compare
\r
2062 result= norm_compare(s1, s2, opt[k].options);
\r
2063 refResult=ref_norm_compare(s1, s2, opt[k].options);
\r
2064 if(sign(result)!=sign(refResult)) {
\r
2065 errln("Normalizer.compare(U+"+hex(start)+" with its NFD, "+opt[k].name+")"
\r
2066 + signString(result)+" should be "+signString(refResult));
\r
2069 // test UnicodeString::caseCompare - same internal implementation function
\r
2070 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) {
\r
2071 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
\r
2073 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
2076 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
2079 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
\r
2081 result=comp.compare(s1,s2);
\r
2082 refResult=ref_case_compare(s1, s2, opt[k].options);
\r
2083 if(sign(result)!=sign(refResult)) {
\r
2084 errln("UTF16.compare(U+"+hex(start)+" with its NFD, "
\r
2085 +opt[k].name+")"+signString(result) +" should be "+signString(refResult));
\r
2096 // verify that case-folding does not un-FCD strings
\r
// Walks code points 0..0x10FFFF (skipping unassigned ones and, per the inline
// comments, Hangul and Han ranges). For each, compares the leading/trailing
// combining classes of its canonical decomposition with those obtained after
// the case-folding step and runs an FCD quick check; problems go to errln.
// foldingOptions: 0 is passed to foldCase() as "true", any other value as "false".
// NOTE(review): declared to return int; the return statement and the count
// bookkeeping are not visible in this extract -- presumably the number of
// offending code points; confirm.
2097 int countFoldFCDExceptions(int foldingOptions) {
\r
2101 int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC;
\r
2102 Normalizer.QuickCheckResult qcResult;
\r
// NOTE(review): the closing parenthesis in this message sits before the hex
// value ("0x)" + value); looks like a typo in the log text.
2107 logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions));
\r
2110 for(c=0; c<=0x10ffff; ++c) {
\r
2111 category=UCharacter.getType(c);
\r
2112 if(category==UCharacterCategory.UNASSIGNED) {
\r
2113 continue; // skip unassigned code points
\r
2116 c=0xd7a3; // skip Hangul - no case folding there
\r
2119 // skip Han blocks - no case folding there either
\r
2133 s= UTF16.valueOf(c);
\r
2135 // get leading and trailing cc for c
\r
2136 d= Normalizer.decompose(s,false);
\r
2138 cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));
\r
2139 trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
\r
2141 // get leading and trailing cc for the case-folding of c
\r
// NOTE(review): the value returned by foldCase() is discarded and s is not
// reassigned, so the decompose()/quickCheck() calls below still see the
// unfolded string. Looks like it should be "s = UCharacter.foldCase(...)";
// if that is changed, an empty folding result must be guarded before charAt().
2142 UCharacter.foldCase(s,(foldingOptions==0));
\r
2143 d = Normalizer.decompose(s, false);
\r
2144 foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));
\r
2145 foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
\r
2147 qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);
\r
2151 // - character maps to empty string: adjacent characters may then need reordering
\r
2152 // - folding has different leading/trailing cc's, and they don't become just 0
\r
2153 // - folding itself is not FCD
\r
2154 if( qcResult!=Normalizer.YES ||
\r
// A changed combining class is tolerated only when the folded side became 0.
2156 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)
\r
2159 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
\r
2160 //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult);
\r
2165 // if a code point is in NFD but its case folding is not, then
\r
2166 // unorm_compare will also fail
\r
// NOTE(review): the assignment of isNFD is not visible in this extract.
2167 if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {
\r
2169 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
\r
// NOTE(review): count is printed in hex while foldingOptions is printed in
// decimal followed by a stray "x" -- the message formatting looks garbled.
2173 logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" );
\r
// Adds up the results of countFoldFCDExceptions() for default folding (0) and
// for FOLD_CASE_EXCLUDE_SPECIAL_I, and reports the total through errln.
// NOTE(review): the condition guarding the errln call is not visible in this
// extract -- presumably it fires only for a non-zero total; confirm.
2177 public void TestFindFoldFCDExceptions() {
\r
2180 count=countFoldFCDExceptions(0);
\r
2181 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
2184 //* If case-folding un-FCDs any strings, then unorm_compare() must be
\r
2185 //* re-implemented.
\r
2186 //* It currently assumes that one can check for FCD then case-fold
\r
2187 //* and then still have FCD strings for raw decomposition without reordering.
\r
2189 errln("error: There are "+count+" code points for which case-folding"+
\r
2190 " may un-FCD a string for all folding options.\n See comment"+
\r
2191 " in BasicNormalizerTest::FindFoldFCDExceptions()!");
\r
2195 public void TestCombiningMarks(){
\r
2196 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";
\r
2197 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
\r
2198 String result = Normalizer.decompose(src,false);
\r
2199 if(!expected.equals(result)){
\r
2200 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result));
\r
2205 * Re-enable this test when UTC fixes UAX 21
\r
2206 public void TestUAX21Failure(){
\r
2207 final String[][] cases = new String[][]{
\r
2208 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"},
\r
2209 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"},
\r
2210 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
\r
2211 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
\r
2212 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"},
\r
2213 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"},
\r
2215 for(int i = 0; i< cases.length; i++){
\r
2216 String s1 =cases[0][0];
\r
2217 String s2 = cases[0][1];
\r
2218 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare
\r
2220 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){
\r
2221 errln("Normalizer.compare() failed for s1: "
\r
2222 + Utility.hex(s1) +" s2: " + Utility.hex(s2));
\r
// Table-driven check of Normalizer.getFC_NFKC_Closure(int): each TestStruct
// pairs a code point with the closure string expected for it (empty string for
// U+0061). Afterwards the two-argument overload is called with a null
// destination as an error-handling probe.
2227 public void TestFCNFKCClosure() {
\r
// Local holder for one table row; the loop below reads its fields as .c and .s.
2228 final class TestStruct{
\r
2231 TestStruct(int cp, String src){
\r
2237 TestStruct[] tests= new TestStruct[]{
\r
2238 new TestStruct( 0x037A, "\u0020\u03B9" ),
\r
2239 new TestStruct( 0x03D2, "\u03C5" ),
\r
2240 new TestStruct( 0x20A8, "\u0072\u0073" ) ,
\r
2241 new TestStruct( 0x210B, "\u0068" ),
\r
2242 new TestStruct( 0x210C, "\u0068" ),
\r
2243 new TestStruct( 0x2121, "\u0074\u0065\u006C" ),
\r
2244 new TestStruct( 0x2122, "\u0074\u006D" ),
\r
2245 new TestStruct( 0x2128, "\u007A" ),
\r
2246 new TestStruct( 0x1D5DB,"\u0068" ),
\r
2247 new TestStruct( 0x1D5ED,"\u007A" ),
\r
2248 new TestStruct( 0x0061, "" )
\r
2252 for(int i = 0; i < tests.length; ++ i) {
\r
2253 String result=Normalizer.getFC_NFKC_Closure(tests[i].c);
\r
// NOTE(review): new String(...) only copies the expected value before equals();
// the copy looks unnecessary.
2254 if(!result.equals(new String(tests[i].s))) {
\r
2255 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong");
\r
2259 /* error handling */
\r
// NOTE(review): the condition on length that guards the errln below is not
// visible in this extract -- confirm which return value is expected here.
2261 int length=Normalizer.getFC_NFKC_Closure(0x5c, null);
\r
2263 errln("getFC_NFKC_Closure did not perform error handling correctly");
\r
2266 public void TestBugJ2324(){
\r
2267 /* String[] input = new String[]{
\r
2276 String troublesome = "\u309A";
\r
2277 for(int i=0x3000; i<0x3100;i++){
\r
2278 String input = ((char)i)+troublesome;
\r
2280 /* String result =*/ Normalizer.compose(input,false);
\r
2281 }catch(IndexOutOfBoundsException e){
\r
2282 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString());
\r
// Index constants. D, C, KD and KC select the entry of the skipSets array that
// initSkippables() fills with a pattern; FCD and NONE are not used in the part
// of the file visible here.
2288 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5;
\r
2289 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets){
\r
2290 if( skipSets.length < 4 ){
\r
2293 skipSets[D].applyPattern(
\r
2294 "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
\r
2295 + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD"
\r
2296 + "\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137"
\r
2297 + "\\u0139-\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165"
\r
2298 + "\\u0168-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC"
\r
2299 + "\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B"
\r
2300 + "\\u021E\\u021F\\u0226-\\u0233\\u0300-\\u034E\\u0350-\\u036F"
\r
2301 + "\\u0374\\u037E\\u0385-\\u038A\\u038C\\u038E-\\u0390\\u03AA-"
\r
2302 + "\\u03B0\\u03CA-\\u03CE\\u03D3\\u03D4\\u0400\\u0401\\u0403\\u0407"
\r
2303 + "\\u040C-\\u040E\\u0419\\u0439\\u0450\\u0451\\u0453\\u0457\\u045C"
\r
2304 + "-\\u045E\\u0476\\u0477\\u0483-\\u0487\\u04C1\\u04C2\\u04D0-"
\r
2305 + "\\u04D3\\u04D6\\u04D7\\u04DA-\\u04DF\\u04E2-\\u04E7\\u04EA-"
\r
2306 + "\\u04F5\\u04F8\\u04F9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4"
\r
2307 + "\\u05C5\\u05C7\\u0610-\\u061A\\u0622-\\u0626\\u064B-\\u065E"
\r
2308 + "\\u0670\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4"
\r
2309 + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
\r
2310 + "\\u07F3\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958"
\r
2311 + "-\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33"
\r
2312 + "\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C"
\r
2313 + "\\u0B48\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD"
\r
2314 + "\\u0C48\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA"
\r
2315 + "\\u0CCB\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE"
\r
2316 + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
\r
2317 + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
\r
2318 + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
\r
2319 + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
\r
2320 + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u135F\\u1714\\u1734"
\r
2321 + "\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B06\\u1B08"
\r
2322 + "\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34\\u1B3B\\u1B3D\\u1B40\\u1B41"
\r
2323 + "\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1DC0-\\u1DE6"
\r
2324 + "\\u1DFE-\\u1E99\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-"
\r
2325 + "\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59"
\r
2326 + "\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC"
\r
2327 + "\\u1FBE\\u1FC1-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-"
\r
2328 + "\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFD\\u2000\\u2001\\u20D0-"
\r
2329 + "\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A\\u212B\\u219A\\u219B"
\r
2330 + "\\u21AE\\u21CD-\\u21CF\\u2204\\u2209\\u220C\\u2224\\u2226\\u2241"
\r
2331 + "\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-\\u2271\\u2274\\u2275"
\r
2332 + "\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285\\u2288\\u2289\\u22AC-"
\r
2333 + "\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED\\u2329\\u232A\\u2ADC"
\r
2334 + "\\u2DE0-\\u2DFF\\u302A-\\u302F\\u304C\\u304E\\u3050\\u3052"
\r
2335 + "\\u3054\\u3056\\u3058\\u305A\\u305C\\u305E\\u3060\\u3062\\u3065"
\r
2336 + "\\u3067\\u3069\\u3070\\u3071\\u3073\\u3074\\u3076\\u3077\\u3079"
\r
2337 + "\\u307A\\u307C\\u307D\\u3094\\u3099\\u309A\\u309E\\u30AC\\u30AE"
\r
2338 + "\\u30B0\\u30B2\\u30B4\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0"
\r
2339 + "\\u30C2\\u30C5\\u30C7\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6"
\r
2340 + "\\u30D7\\u30D9\\u30DA\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE"
\r
2341 + "\\uA66F\\uA67C\\uA67D\\uA806\\uA8C4\\uA92B-\\uA92D\\uA953\\uAC00"
\r
2342 + "-\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"
\r
2343 + "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6A\\uFA70-"
\r
2344 + "\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E"
\r
2345 + "\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFB4E\\uFE20-\\uFE26"
\r
2346 + "\\U000101FD\\U00010A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010"
\r
2347 + "A3F\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001D172\\U0001D17B-"
\r
2348 + "\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001"
\r
2349 + "D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002F800-\\U0002FA1D]", false);
\r
2351 skipSets[C].applyPattern(
\r
2352 "[^<->A-PR-Za-pr-z\\u00A8\\u00C0-\\u00CF\\u00D1-\\u00D6\\u00D8-"
\r
2353 + "\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD\\u00FF-"
\r
2354 + "\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121\\u0124"
\r
2355 + "\\u0125\\u0128-\\u012D\\u0130\\u0139\\u013A\\u013D\\u013E\\u0143"
\r
2356 + "\\u0144\\u0147\\u0148\\u014C-\\u0151\\u0154\\u0155\\u0158-"
\r
2357 + "\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168-\\u0171\\u0174-"
\r
2358 + "\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7\\u01CD-\\u01DC\\u01DE"
\r
2359 + "-\\u01E1\\u01E6-\\u01EB\\u01F4\\u01F5\\u01F8-\\u01FB\\u0200-"
\r
2360 + "\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0292\\u0300-\\u034E"
\r
2361 + "\\u0350-\\u036F\\u0374\\u037E\\u0387\\u0391\\u0395\\u0397\\u0399"
\r
2362 + "\\u039F\\u03A1\\u03A5\\u03A9\\u03AC\\u03AE\\u03B1\\u03B5\\u03B7"
\r
2363 + "\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-\\u03CB\\u03CE\\u03D2\\u0406"
\r
2364 + "\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423\\u0427\\u042B"
\r
2365 + "\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E\\u0443\\u0447"
\r
2366 + "\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487\\u04D8\\u04D9"
\r
2367 + "\\u04E8\\u04E9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5"
\r
2368 + "\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627\\u0648\\u064A-"
\r
2369 + "\\u065E\\u0670\\u06C1\\u06D2\\u06D5-\\u06DC\\u06DF-\\u06E4"
\r
2370 + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
\r
2371 + "\\u07F3\\u0928\\u0930\\u0933\\u093C\\u094D\\u0951-\\u0954\\u0958"
\r
2372 + "-\\u095F\\u09BC\\u09BE\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF"
\r
2373 + "\\u0A33\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD"
\r
2374 + "\\u0B3C\\u0B3E\\u0B47\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92"
\r
2375 + "\\u0BBE\\u0BC6\\u0BC7\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56"
\r
2376 + "\\u0CBC\\u0CBF\\u0CC2\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E"
\r
2377 + "\\u0D46\\u0D47\\u0D4D\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF"
\r
2378 + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
\r
2379 + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
\r
2380 + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
\r
2381 + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
\r
2382 + "\\u0FC6\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u1100-\\u1112"
\r
2383 + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
\r
2384 + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B05\\u1B07\\u1B09"
\r
2385 + "\\u1B0B\\u1B0D\\u1B11\\u1B34\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F"
\r
2386 + "\\u1B42\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1DC0-\\u1DE6"
\r
2387 + "\\u1DFE-\\u1E03\\u1E0A-\\u1E0F\\u1E12-\\u1E1B\\u1E20-\\u1E27"
\r
2388 + "\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-\\u1E7D\\u1E80-\\u1E87"
\r
2389 + "\\u1E8E-\\u1E91\\u1E96-\\u1E99\\u1EA0-\\u1EF3\\u1EF6-\\u1EF9"
\r
2390 + "\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-\\u1F31\\u1F38\\u1F39"
\r
2391 + "\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51\\u1F59\\u1F60-\\u1F71"
\r
2392 + "\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-\\u1F7D\\u1F80\\u1F81"
\r
2393 + "\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99\\u1FA0\\u1FA1\\u1FA8"
\r
2394 + "\\u1FA9\\u1FB3\\u1FB6\\u1FBB\\u1FBC\\u1FBE\\u1FBF\\u1FC3\\u1FC6"
\r
2395 + "\\u1FC9\\u1FCB\\u1FCC\\u1FD3\\u1FDB\\u1FE3\\u1FEB\\u1FEE\\u1FEF"
\r
2396 + "\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000\\u2001\\u20D0-"
\r
2397 + "\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A\\u212B\\u2190\\u2192"
\r
2398 + "\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208\\u220B\\u2223\\u2225"
\r
2399 + "\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261\\u2264\\u2265\\u2272"
\r
2400 + "\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282\\u2283\\u2286\\u2287"
\r
2401 + "\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB\\u22B2-\\u22B5\\u2329"
\r
2402 + "\\u232A\\u2ADC\\u2DE0-\\u2DFF\\u302A-\\u302F\\u3046\\u304B"
\r
2403 + "\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059\\u305B\\u305D"
\r
2404 + "\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072\\u3075\\u3078"
\r
2405 + "\\u307B\\u3099\\u309A\\u309D\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1"
\r
2406 + "\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4"
\r
2407 + "\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2"
\r
2408 + "\\u30FD\\uA66F\\uA67C\\uA67D\\uA806\\uA8C4\\uA92B-\\uA92D\\uA953"
\r
2409 + "\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8\\uACC4\\uACE0"
\r
2410 + "\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4\\uADC0\\uADDC"
\r
2411 + "\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0\\uAEBC\\uAED8"
\r
2412 + "\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C\\uAFB8\\uAFD4"
\r
2413 + "\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C\\uB098\\uB0B4\\uB0D0"
\r
2414 + "\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178\\uB194\\uB1B0\\uB1CC"
\r
2415 + "\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274\\uB290\\uB2AC\\uB2C8"
\r
2416 + "\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370\\uB38C\\uB3A8\\uB3C4"
\r
2417 + "\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C\\uB488\\uB4A4\\uB4C0"
\r
2418 + "\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568\\uB584\\uB5A0\\uB5BC"
\r
2419 + "\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664\\uB680\\uB69C\\uB6B8"
\r
2420 + "\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760\\uB77C\\uB798\\uB7B4"
\r
2421 + "\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C\\uB878\\uB894\\uB8B0"
\r
2422 + "\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958\\uB974\\uB990\\uB9AC"
\r
2423 + "\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70\\uBA8C\\uBAA8"
\r
2424 + "\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C\\uBB88\\uBBA4"
\r
2425 + "\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68\\uBC84\\uBCA0"
\r
2426 + "\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64\\uBD80\\uBD9C"
\r
2427 + "\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60\\uBE7C\\uBE98"
\r
2428 + "\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C\\uBF78\\uBF94"
\r
2429 + "\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C\\uC058\\uC074\\uC090"
\r
2430 + "\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138\\uC154\\uC170\\uC18C"
\r
2431 + "\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234\\uC250\\uC26C\\uC288"
\r
2432 + "\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C\\uC368\\uC384"
\r
2433 + "\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448\\uC464\\uC480"
\r
2434 + "\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544\\uC560\\uC57C"
\r
2435 + "\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624\\uC640\\uC65C\\uC678"
\r
2436 + "\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C\\uC758\\uC774"
\r
2437 + "\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838\\uC854\\uC870"
\r
2438 + "\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934\\uC950\\uC96C"
\r
2439 + "\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30\\uCA4C\\uCA68"
\r
2440 + "\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C\\uCB48\\uCB64"
\r
2441 + "\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28\\uCC44\\uCC60"
\r
2442 + "\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24\\uCD40\\uCD5C"
\r
2443 + "\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20\\uCE3C\\uCE58"
\r
2444 + "\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C\\uCF38\\uCF54"
\r
2445 + "\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018\\uD034\\uD050"
\r
2446 + "\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114\\uD130\\uD14C"
\r
2447 + "\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210\\uD22C\\uD248"
\r
2448 + "\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C\\uD328\\uD344"
\r
2449 + "\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408\\uD424\\uD440"
\r
2450 + "\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504\\uD520\\uD53C"
\r
2451 + "\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600\\uD61C\\uD638"
\r
2452 + "\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC\\uD718\\uD734"
\r
2453 + "\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-"
\r
2454 + "\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-"
\r
2455 + "\\uFA6A\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36\\uFB38-"
\r
2456 + "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFB4E\\uFE20"
\r
2457 + "-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010A38-\\U00010A3A"
\r
2458 + "\\U00010A3F\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001D172\\U0001"
\r
2459 + "D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-\\U0001D1AD"
\r
2460 + "\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002F800-"
\r
2461 + "\\U0002FA1D]", false);
\r
2463 skipSets[KD].applyPattern(
\r
2464 "[^\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5\\u00B8-\\u00BA"
\r
2465 + "\\u00BC-\\u00BE\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6"
\r
2466 + "\\u00D9-\\u00DD\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6"
\r
2467 + "\\u00F9-\\u00FD\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130"
\r
2468 + "\\u0132-\\u0137\\u0139-\\u0140\\u0143-\\u0149\\u014C-\\u0151"
\r
2469 + "\\u0154-\\u0165\\u0168-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0"
\r
2470 + "\\u01C4-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B"
\r
2471 + "\\u021E\\u021F\\u0226-\\u0233\\u02B0-\\u02B8\\u02D8-\\u02DD"
\r
2472 + "\\u02E0-\\u02E4\\u0300-\\u034E\\u0350-\\u036F\\u0374\\u037A"
\r
2473 + "\\u037E\\u0384-\\u038A\\u038C\\u038E-\\u0390\\u03AA-\\u03B0"
\r
2474 + "\\u03CA-\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
\r
2475 + "\\u03F9\\u0400\\u0401\\u0403\\u0407\\u040C-\\u040E\\u0419\\u0439"
\r
2476 + "\\u0450\\u0451\\u0453\\u0457\\u045C-\\u045E\\u0476\\u0477\\u0483"
\r
2477 + "-\\u0487\\u04C1\\u04C2\\u04D0-\\u04D3\\u04D6\\u04D7\\u04DA-"
\r
2478 + "\\u04DF\\u04E2-\\u04E7\\u04EA-\\u04F5\\u04F8\\u04F9\\u0587"
\r
2479 + "\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610"
\r
2480 + "-\\u061A\\u0622-\\u0626\\u064B-\\u065E\\u0670\\u0675-\\u0678"
\r
2481 + "\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7"
\r
2482 + "\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-\\u07F3"
\r
2483 + "\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958-"
\r
2484 + "\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36"
\r
2485 + "\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B48"
\r
2486 + "\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD\\u0C48"
\r
2487 + "\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA\\u0CCB"
\r
2488 + "\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE\\u0E33"
\r
2489 + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-"
\r
2490 + "\\u0ECB\\u0EDC\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39"
\r
2491 + "\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80"
\r
2492 + "-\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
\r
2493 + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u10FC\\u135F\\u1714"
\r
2494 + "\\u1734\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B06"
\r
2495 + "\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34\\u1B3B\\u1B3D\\u1B40"
\r
2496 + "\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1D2C-"
\r
2497 + "\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-\\u1D6A\\u1D78"
\r
2498 + "\\u1D9B-\\u1DE6\\u1DFE-\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15"
\r
2499 + "\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57"
\r
2500 + "\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-"
\r
2501 + "\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-"
\r
2502 + "\\u1FF4\\u1FF6-\\u1FFE\\u2000-\\u200A\\u2011\\u2017\\u2024-"
\r
2503 + "\\u2026\\u202F\\u2033\\u2034\\u2036\\u2037\\u203C\\u203E\\u2047-"
\r
2504 + "\\u2049\\u2057\\u205F\\u2070\\u2071\\u2074-\\u208E\\u2090-"
\r
2505 + "\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2100-"
\r
2506 + "\\u2103\\u2105-\\u2107\\u2109-\\u2113\\u2115\\u2116\\u2119-"
\r
2507 + "\\u211D\\u2120-\\u2122\\u2124\\u2126\\u2128\\u212A-\\u212D"
\r
2508 + "\\u212F-\\u2131\\u2133-\\u2139\\u213B-\\u2140\\u2145-\\u2149"
\r
2509 + "\\u2153-\\u217F\\u219A\\u219B\\u21AE\\u21CD-\\u21CF\\u2204"
\r
2510 + "\\u2209\\u220C\\u2224\\u2226\\u222C\\u222D\\u222F\\u2230\\u2241"
\r
2511 + "\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-\\u2271\\u2274\\u2275"
\r
2512 + "\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285\\u2288\\u2289\\u22AC-"
\r
2513 + "\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED\\u2329\\u232A\\u2460-"
\r
2514 + "\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2D6F\\u2DE0"
\r
2515 + "-\\u2DFF\\u2E9F\\u2EF3\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F"
\r
2516 + "\\u3036\\u3038-\\u303A\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056"
\r
2517 + "\\u3058\\u305A\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069"
\r
2518 + "\\u3070\\u3071\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C"
\r
2519 + "\\u307D\\u3094\\u3099-\\u309C\\u309E\\u309F\\u30AC\\u30AE\\u30B0"
\r
2520 + "\\u30B2\\u30B4\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2"
\r
2521 + "\\u30C5\\u30C7\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7"
\r
2522 + "\\u30D9\\u30DA\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\u30FF"
\r
2523 + "\\u3131-\\u318E\\u3192-\\u319F\\u3200-\\u321E\\u3220-\\u3243"
\r
2524 + "\\u3250-\\u327E\\u3280-\\u32FE\\u3300-\\u33FF\\uA66F\\uA67C"
\r
2525 + "\\uA67D\\uA770\\uA806\\uA8C4\\uA92B-\\uA92D\\uA953\\uAC00-"
\r
2526 + "\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"
\r
2527 + "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6A\\uFA70-"
\r
2528 + "\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB36\\uFB38-"
\r
2529 + "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3"
\r
2530 + "-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-"
\r
2531 + "\\uFE19\\uFE20-\\uFE26\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-"
\r
2532 + "\\uFE66\\uFE68-\\uFE6B\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC"
\r
2533 + "\\uFF01-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7"
\r
2534 + "\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010"
\r
2535 + "A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010A3F\\U0001D15E-"
\r
2536 + "\\U0001D169\\U0001D16D-\\U0001D172\\U0001D17B-\\U0001D182\\U0001"
\r
2537 + "D185-\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001D1BB-\\U0001D1C0"
\r
2538 + "\\U0001D242-\\U0001D244\\U0001D400-\\U0001D454\\U0001D456-"
\r
2539 + "\\U0001D49C\\U0001D49E\\U0001D49F\\U0001D4A2\\U0001D4A5\\U0001D4"
\r
2540 + "A6\\U0001D4A9-\\U0001D4AC\\U0001D4AE-\\U0001D4B9\\U0001D4BB"
\r
2541 + "\\U0001D4BD-\\U0001D4C3\\U0001D4C5-\\U0001D505\\U0001D507-"
\r
2542 + "\\U0001D50A\\U0001D50D-\\U0001D514\\U0001D516-\\U0001D51C\\U0001"
\r
2543 + "D51E-\\U0001D539\\U0001D53B-\\U0001D53E\\U0001D540-\\U0001D544"
\r
2544 + "\\U0001D546\\U0001D54A-\\U0001D550\\U0001D552-\\U0001D6A5\\U0001"
\r
2545 + "D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF\\U0002F800-\\U0002FA1D]", false);
\r
2547 skipSets[KC].applyPattern(
\r
2548 "[^<->A-PR-Za-pr-z\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5"
\r
2549 + "\\u00B8-\\u00BA\\u00BC-\\u00BE\\u00C0-\\u00CF\\u00D1-\\u00D6"
\r
2550 + "\\u00D8-\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD"
\r
2551 + "\\u00FF-\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121"
\r
2552 + "\\u0124\\u0125\\u0128-\\u012D\\u0130\\u0132\\u0133\\u0139\\u013A"
\r
2553 + "\\u013D-\\u0140\\u0143\\u0144\\u0147-\\u0149\\u014C-\\u0151"
\r
2554 + "\\u0154\\u0155\\u0158-\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168"
\r
2555 + "-\\u0171\\u0174-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7"
\r
2556 + "\\u01C4-\\u01DC\\u01DE-\\u01E1\\u01E6-\\u01EB\\u01F1-\\u01F5"
\r
2557 + "\\u01F8-\\u01FB\\u0200-\\u021B\\u021E\\u021F\\u0226-\\u0233"
\r
2558 + "\\u0292\\u02B0-\\u02B8\\u02D8-\\u02DD\\u02E0-\\u02E4\\u0300-"
\r
2559 + "\\u034E\\u0350-\\u036F\\u0374\\u037A\\u037E\\u0384\\u0385\\u0387"
\r
2560 + "\\u0391\\u0395\\u0397\\u0399\\u039F\\u03A1\\u03A5\\u03A9\\u03AC"
\r
2561 + "\\u03AE\\u03B1\\u03B5\\u03B7\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-"
\r
2562 + "\\u03CB\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
\r
2563 + "\\u03F9\\u0406\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423"
\r
2564 + "\\u0427\\u042B\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E"
\r
2565 + "\\u0443\\u0447\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487"
\r
2566 + "\\u04D8\\u04D9\\u04E8\\u04E9\\u0587\\u0591-\\u05BD\\u05BF\\u05C1"
\r
2567 + "\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627"
\r
2568 + "\\u0648\\u064A-\\u065E\\u0670\\u0675-\\u0678\\u06C1\\u06D2"
\r
2569 + "\\u06D5-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED"
\r
2570 + "\\u0711\\u0730-\\u074A\\u07EB-\\u07F3\\u0928\\u0930\\u0933"
\r
2571 + "\\u093C\\u094D\\u0951-\\u0954\\u0958-\\u095F\\u09BC\\u09BE"
\r
2572 + "\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36\\u0A3C"
\r
2573 + "\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B3E\\u0B47"
\r
2574 + "\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92\\u0BBE\\u0BC6\\u0BC7"
\r
2575 + "\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CBF\\u0CC2"
\r
2576 + "\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E\\u0D46\\u0D47\\u0D4D"
\r
2577 + "\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF\\u0E33\\u0E38-\\u0E3A"
\r
2578 + "\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-\\u0ECB\\u0EDC"
\r
2579 + "\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D"
\r
2580 + "\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80-\\u0F84"
\r
2581 + "\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9\\u0FC6"
\r
2582 + "\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u10FC\\u1100-\\u1112"
\r
2583 + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
\r
2584 + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B05\\u1B07\\u1B09"
\r
2585 + "\\u1B0B\\u1B0D\\u1B11\\u1B34\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F"
\r
2586 + "\\u1B42\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1D2C-\\u1D2E"
\r
2587 + "\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-\\u1D6A\\u1D78\\u1D9B-"
\r
2588 + "\\u1DE6\\u1DFE-\\u1E03\\u1E0A-\\u1E0F\\u1E12-\\u1E1B\\u1E20-"
\r
2589 + "\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-\\u1E7D\\u1E80-"
\r
2590 + "\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E9B\\u1EA0-\\u1EF3\\u1EF6-"
\r
2591 + "\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-\\u1F31\\u1F38"
\r
2592 + "\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51\\u1F59\\u1F60-"
\r
2593 + "\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-\\u1F7D\\u1F80"
\r
2594 + "\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99\\u1FA0\\u1FA1"
\r
2595 + "\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB-\\u1FC1\\u1FC3\\u1FC6\\u1FC9"
\r
2596 + "\\u1FCB-\\u1FCF\\u1FD3\\u1FDB\\u1FDD-\\u1FDF\\u1FE3\\u1FEB"
\r
2597 + "\\u1FED-\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000-"
\r
2598 + "\\u200A\\u2011\\u2017\\u2024-\\u2026\\u202F\\u2033\\u2034\\u2036"
\r
2599 + "\\u2037\\u203C\\u203E\\u2047-\\u2049\\u2057\\u205F\\u2070\\u2071"
\r
2600 + "\\u2074-\\u208E\\u2090-\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1"
\r
2601 + "\\u20E5-\\u20F0\\u2100-\\u2103\\u2105-\\u2107\\u2109-\\u2113"
\r
2602 + "\\u2115\\u2116\\u2119-\\u211D\\u2120-\\u2122\\u2124\\u2126"
\r
2603 + "\\u2128\\u212A-\\u212D\\u212F-\\u2131\\u2133-\\u2139\\u213B-"
\r
2604 + "\\u2140\\u2145-\\u2149\\u2153-\\u217F\\u2190\\u2192\\u2194"
\r
2605 + "\\u21D0\\u21D2\\u21D4\\u2203\\u2208\\u220B\\u2223\\u2225\\u222C"
\r
2606 + "\\u222D\\u222F\\u2230\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261"
\r
2607 + "\\u2264\\u2265\\u2272\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282"
\r
2608 + "\\u2283\\u2286\\u2287\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB"
\r
2609 + "\\u22B2-\\u22B5\\u2329\\u232A\\u2460-\\u24EA\\u2A0C\\u2A74-"
\r
2610 + "\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2D6F\\u2DE0-\\u2DFF\\u2E9F\\u2EF3"
\r
2611 + "\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F\\u3036\\u3038-\\u303A"
\r
2612 + "\\u3046\\u304B\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059"
\r
2613 + "\\u305B\\u305D\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072"
\r
2614 + "\\u3075\\u3078\\u307B\\u3099-\\u309D\\u309F\\u30A6\\u30AB\\u30AD"
\r
2615 + "\\u30AF\\u30B1\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF"
\r
2616 + "\\u30C1\\u30C4\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB"
\r
2617 + "\\u30EF-\\u30F2\\u30FD\\u30FF\\u3131-\\u318E\\u3192-\\u319F"
\r
2618 + "\\u3200-\\u321E\\u3220-\\u3243\\u3250-\\u327E\\u3280-\\u32FE"
\r
2619 + "\\u3300-\\u33FF\\uA66F\\uA67C\\uA67D\\uA770\\uA806\\uA8C4\\uA92B"
\r
2620 + "-\\uA92D\\uA953\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8"
\r
2621 + "\\uACC4\\uACE0\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4"
\r
2622 + "\\uADC0\\uADDC\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0"
\r
2623 + "\\uAEBC\\uAED8\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C"
\r
2624 + "\\uAFB8\\uAFD4\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C\\uB098"
\r
2625 + "\\uB0B4\\uB0D0\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178\\uB194"
\r
2626 + "\\uB1B0\\uB1CC\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274\\uB290"
\r
2627 + "\\uB2AC\\uB2C8\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370\\uB38C"
\r
2628 + "\\uB3A8\\uB3C4\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C\\uB488"
\r
2629 + "\\uB4A4\\uB4C0\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568\\uB584"
\r
2630 + "\\uB5A0\\uB5BC\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664\\uB680"
\r
2631 + "\\uB69C\\uB6B8\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760\\uB77C"
\r
2632 + "\\uB798\\uB7B4\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C\\uB878"
\r
2633 + "\\uB894\\uB8B0\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958\\uB974"
\r
2634 + "\\uB990\\uB9AC\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70"
\r
2635 + "\\uBA8C\\uBAA8\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C"
\r
2636 + "\\uBB88\\uBBA4\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68"
\r
2637 + "\\uBC84\\uBCA0\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64"
\r
2638 + "\\uBD80\\uBD9C\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60"
\r
2639 + "\\uBE7C\\uBE98\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C"
\r
2640 + "\\uBF78\\uBF94\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C\\uC058"
\r
2641 + "\\uC074\\uC090\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138\\uC154"
\r
2642 + "\\uC170\\uC18C\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234\\uC250"
\r
2643 + "\\uC26C\\uC288\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C"
\r
2644 + "\\uC368\\uC384\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448"
\r
2645 + "\\uC464\\uC480\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544"
\r
2646 + "\\uC560\\uC57C\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624\\uC640"
\r
2647 + "\\uC65C\\uC678\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C"
\r
2648 + "\\uC758\\uC774\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838"
\r
2649 + "\\uC854\\uC870\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934"
\r
2650 + "\\uC950\\uC96C\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30"
\r
2651 + "\\uCA4C\\uCA68\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C"
\r
2652 + "\\uCB48\\uCB64\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28"
\r
2653 + "\\uCC44\\uCC60\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24"
\r
2654 + "\\uCD40\\uCD5C\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20"
\r
2655 + "\\uCE3C\\uCE58\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C"
\r
2656 + "\\uCF38\\uCF54\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018"
\r
2657 + "\\uD034\\uD050\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114"
\r
2658 + "\\uD130\\uD14C\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210"
\r
2659 + "\\uD22C\\uD248\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C"
\r
2660 + "\\uD328\\uD344\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408"
\r
2661 + "\\uD424\\uD440\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504"
\r
2662 + "\\uD520\\uD53C\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600"
\r
2663 + "\\uD61C\\uD638\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC"
\r
2664 + "\\uD718\\uD734\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12"
\r
2665 + "\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D"
\r
2666 + "\\uFA30-\\uFA6A\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17"
\r
2667 + "\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43"
\r
2668 + "\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F\\uFD92-"
\r
2669 + "\\uFDC7\\uFDF0-\\uFDFC\\uFE10-\\uFE19\\uFE20-\\uFE26\\uFE30-"
\r
2670 + "\\uFE44\\uFE47-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B\\uFE70-"
\r
2671 + "\\uFE72\\uFE74\\uFE76-\\uFEFC\\uFF01-\\uFFBE\\uFFC2-\\uFFC7"
\r
2672 + "\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6"
\r
2673 + "\\uFFE8-\\uFFEE\\U000101FD\\U00010A0D\\U00010A0F\\U00010A38-"
\r
2674 + "\\U00010A3A\\U00010A3F\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
\r
2675 + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
\r
2676 + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0001"
\r
2677 + "D400-\\U0001D454\\U0001D456-\\U0001D49C\\U0001D49E\\U0001D49F"
\r
2678 + "\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4A9-\\U0001D4AC\\U0001D"
\r
2679 + "4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-\\U0001D4C3\\U0001D4C5-"
\r
2680 + "\\U0001D505\\U0001D507-\\U0001D50A\\U0001D50D-\\U0001D514\\U0001"
\r
2681 + "D516-\\U0001D51C\\U0001D51E-\\U0001D539\\U0001D53B-\\U0001D53E"
\r
2682 + "\\U0001D540-\\U0001D544\\U0001D546\\U0001D54A-\\U0001D550\\U0001"
\r
2683 + "D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF"
\r
2684 + "\\U0002F800-\\U0002FA1D]", false);
\r
// Checks Normalizer.isNFSkippable() against the hard-coded expectation sets.
// One UnicodeSet per mode is filled from the runtime data (array slots are
// addressed through the constants D, C, KC, KD, FCD and NONE), the expected
// sets are obtained from initSkippables(), and every mismatch is reported
// through errln() together with the differences between the two sets.
// NOTE(review): the embedded original line numbers skip values throughout this
// method (e.g. 2698-2706, 2711, 2731-2733, 2737-2742, 2775-2781), so closing
// braces and some statements are not visible in this copy -- confirm against
// the full file before changing any logic here.
2689 public void TestSkippable() {
\r
2690 UnicodeSet starts;
\r
2691 UnicodeSet[] skipSets = new UnicodeSet[]{
\r
2692 new UnicodeSet(), //NFD
\r
2693 new UnicodeSet(), //NFC
\r
2694 new UnicodeSet(), //NFKC
\r
2695 new UnicodeSet(), //NFKD
\r
2696 new UnicodeSet(), //FCD
\r
2697 new UnicodeSet(), //NONE
\r
2699 UnicodeSet[] expectSets = new UnicodeSet[]{
\r
2707 StringBuffer s, pattern;
\r
2708 int start, limit, rangeEnd;
\r
2709 int i, range, count;
\r
2710 starts = new UnicodeSet();
\r
// NOTE(review): the loop below queries every code point individually and fills
// the same sets as the range-based pass further down. It looks redundant and
// may sit inside a block comment in the full file (the lines around it are not
// visible in this copy) -- confirm.
2712 //[\u0350-\u0357\u035D-\u035F\u0610-\u0615\u0656-\u0658\u0CBC\u17DD\u1939-\u193B]
\r
2713 for(int ch=0;ch<=0x10FFFF;ch++){
\r
2714 if(Normalizer.isNFSkippable(ch, Normalizer.NFD)) {
\r
2715 skipSets[D].add(ch);
\r
2717 if(Normalizer.isNFSkippable(ch, Normalizer.NFKD)) {
\r
2718 skipSets[KD].add(ch);
\r
2720 if(Normalizer.isNFSkippable(ch, Normalizer.NFC)) {
\r
2721 skipSets[C].add(ch);
\r
2723 if(Normalizer.isNFSkippable(ch, Normalizer.NFKC)) {
\r
2724 skipSets[KC].add(ch);
\r
2726 if(Normalizer.isNFSkippable(ch, Normalizer.FCD)) {
\r
2727 skipSets[FCD].add(ch);
\r
2729 if(Normalizer.isNFSkippable(ch, Normalizer.NONE)) {
\r
2730 skipSets[NONE].add(ch);
\r
2734 // build NF*Skippable sets from runtime data
\r
// starts receives the property start points; its ranges drive the pass below.
2735 NormalizerImpl.addPropertyStarts(starts);
\r
2736 count=starts.getRangeCount();
\r
// Only the first code point of each [start..limit[ span is queried; the answer
// is applied to the whole span (start through limit-1).
2743 // get properties for start and apply them to [start..limit[
\r
2744 if(Normalizer.isNFSkippable(start, Normalizer.NFD)) {
\r
2745 skipSets[D].add(start, limit-1);
\r
2747 if(Normalizer.isNFSkippable(start, Normalizer.NFKD)) {
\r
2748 skipSets[KD].add(start, limit-1);
\r
2750 if(Normalizer.isNFSkippable(start, Normalizer.NFC)) {
\r
2751 skipSets[C].add(start, limit-1);
\r
2753 if(Normalizer.isNFSkippable(start, Normalizer.NFKC)) {
\r
2754 skipSets[KC].add(start, limit-1);
\r
2756 if(Normalizer.isNFSkippable(start, Normalizer.FCD)) {
\r
2757 skipSets[FCD].add(start, limit-1);
\r
2759 if(Normalizer.isNFSkippable(start, Normalizer.NONE)) {
\r
2760 skipSets[NONE].add(start, limit-1);
\r
2765 // go to next range of same properties
\r
2767 if(++limit>rangeEnd) {
\r
2769 limit=starts.getRangeStart(range);
\r
2770 rangeEnd=starts.getRangeEnd(range);
\r
2772 } else if(range==count) {
\r
2773 // additional range to complete the Unicode code space
\r
2774 limit=rangeEnd=0x110000;
\r
// Replace the empty placeholder sets with the hard-coded expectations.
2782 expectSets = initSkippables(expectSets);
\r
// Sanity check on the expectation data itself: U+0350 must not be listed in
// the expected set for D.
2783 if(expectSets[D].contains(0x0350)){
\r
2784 errln("expectSets[D] contains 0x0350");
\r
// The literal bound 4 (instead of expectSets.length) restricts the comparison
// to the first four array slots.
2786 //expectSets.length for now do not test FCD and NONE since there is no data
\r
2787 for(i=0; i< 4; ++i) {
\r
2789 if(!skipSets[i].equals(expectSets[i])) {
\r
2790 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"+
\r
2791 "May need to update hardcoded UnicodeSet patterns in com.ibm.icu.dev.test.normalizer.BasicTest.java\n"+
\r
2792 "See ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n" +
\r
2793 "Run com.ibm.text.UCD.Main with the option NFSkippable.");
\r
// Build a diagnostic dump: the runtime set, both one-sided differences and
// the intersection, each as an escaped UnicodeSet pattern.
2795 s=new StringBuffer();
\r
2797 s.append("\n\nskip= ");
\r
2798 s.append(skipSets[i].toPattern(true));
\r
2801 s.append("skip-expect=");
\r
// skip-expect: code points found skippable at runtime but missing from the
// expectation. The sets are cloned so removeAll() does not modify them.
2802 pattern = new StringBuffer(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));
\r
2803 s.append(pattern);
\r
2805 pattern.delete(0,pattern.length());
\r
2806 s.append("\n\nexpect-skip=");
\r
// expect-skip: code points expected to be skippable but not reported at runtime.
2807 pattern = new StringBuffer(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));
\r
2808 s.append(pattern);
\r
2811 pattern.delete(0,pattern.length());
\r
2812 s.append("\n\nintersection(expect,skip)=");
\r
2813 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);
\r
2814 pattern = new StringBuffer(intersection.toPattern(true));
\r
2815 s.append(pattern);
\r
2820 errln(s.toString());
\r
// Test named after bug J2068: a Normalizer constructed from a
// UCharacterIterator is expected to work on its own copy of that iterator.
// An error is reported when text.current() and norm.current() return the same
// value.
// NOTE(review): embedded line 2829 is absent from this copy; presumably it
// repositions one of the two iterators before the comparison -- confirm
// against the full file.
2825 public void TestBugJ2068(){
\r
2826 String sample = "The quick brown fox jumped over the lazy dog";
\r
2827 UCharacterIterator text = UCharacterIterator.getInstance(sample);
\r
2828 Normalizer norm = new Normalizer(text,Normalizer.NFC,0);
\r
// Equal current values are taken as a sign that both objects share one iterator.
2830 if(text.current() == norm.current()){
\r
2831 errln("Normalizer is not cloning the UCharacterIterator");
\r
// Queries UCharacter.getCombiningClass() for the code points 0 .. 0x10FFFE
// (the loop condition is i<0x10FFFF, so U+10FFFF itself is not visited) and
// reports an error for any surrogate code point (0xD800..0xDFFF) whose
// combining class is greater than zero.
2834 public void TestGetCombiningClass(){
\r
2835 for(int i=0;i<0x10FFFF;i++){
\r
2836 int cc = UCharacter.getCombiningClass(i);
\r
2837 if(0xD800<= i && i<=0xDFFF && cc >0 ){
\r
// Repeats the lookup made just above with the same argument.
// NOTE(review): looks redundant; possibly kept as a spot for a debugger
// breakpoint -- confirm before removing.
2838 cc = UCharacter.getCombiningClass(i);
\r
2839 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8));
\r
// Spot-checks NormalizerImpl.getNX(): the set returned for option value 1
// (NX_HANGUL according to the inline comment) must contain U+AC01, and the set
// returned for option value 2 (NX_CJK_COMPAT) must contain U+FA20; otherwise
// an error is reported.
2844 public void TestGetNX(){
\r
// The option is passed as a literal; the constant name appears only in the comment.
2845 UnicodeSet set = NormalizerImpl.getNX(1 /*NormalizerImpl.NX_HANGUL*/);
\r
2846 if(!set.contains(0xac01)){
\r
2847 errln("getNX did not return correct set for NX_HANGUL");
\r
2850 set = NormalizerImpl.getNX(2/*NormalizerImpl.NX_CJK_COMPAT*/);
\r
2851 if(!set.contains('\uFA20')){
\r
2852 errln("getNX did not return correct set for NX_CJK_COMPAT");
\r
// Cross-checks USerializedSet against UnicodeSet. The set that
// NormalizerImpl.getCanonStartSet() produces for U+0130 is read into sset, its
// ranges are copied into a UnicodeSet, and the ranges of that UnicodeSet are
// then probed with sset.contains(); a miss is reported through errln().
// If getCanonStartSet() returns false the UnicodeSet stays empty and nothing
// is probed.
// NOTE(review): the declarations of start/end and any statement advancing
// start inside the inner loop are not visible in this copy (embedded lines
// 2858 and 2880-2884 are absent) -- confirm against the full file.
2855 public void TestSerializedSet(){
\r
2856 USerializedSet sset=new USerializedSet();
\r
2857 UnicodeSet set = new UnicodeSet();
\r
2860 // collect all sets into one for contiguous output
\r
// Receives one range per getRange() call: [0] = first, [1] = last code point.
2861 int[] startEnd = new int[2];
\r
2863 if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {
\r
2864 int count=sset.countRanges();
\r
2865 for(int j=0; j<count; ++j) {
\r
2866 sset.getRange(j, startEnd);
\r
2867 set.add(startEnd[0], startEnd[1]);
\r
2872 // test all of these precomposed characters
\r
2873 UnicodeSetIterator it = new UnicodeSetIterator(set);
\r
// Iteration stops at the first string element, so only code point ranges are checked.
2874 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
\r
2875 start=it.codepoint;
\r
2876 end=it.codepointEnd;
\r
2877 while(start<=end) {
\r
2878 if(!sset.contains(start)){
\r
2879 errln("USerializedSet.contains failed for "+Utility.hex(start,8));
\r
// Calls Normalizer.decompose() twice on the same six characters
// ('r', U+00E9, 's', 'u', 'm', U+00E9) and the same destination buffer: once
// with destination start 0 and once with destination start 10. Both return
// values are kept (rc and rc1).
// NOTE(review): the condition guarding the errln() below is not visible in
// this copy (embedded line 2890 is absent); judging by the message it
// presumably compares rc with rc1 -- confirm against the full file.
2885 public void TestReturnFailure(){
\r
2886 char[] term = {'r','\u00e9','s','u','m','\u00e9' };
\r
// Buffer size = 10 (the offset used by the second call) + input length + 2;
// presumably the extra 2 leaves room for the expanded characters -- confirm.
2887 char[] decomposed_term = new char[10 + term.length + 2];
\r
2888 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0);
\r
2889 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0);
\r
2891 errln("Normalizer decompose did not return correct length");
\r
// Value holder for one composition test case: the normalization mode, the
// option bits, the input string and the expected result. TestComposition()
// builds an array of these.
// NOTE(review): only the assignments to options and expect are visible in this
// copy of the constructor (embedded lines 2900 and 2902 are absent);
// presumably mode and input are stored the same way -- confirm.
2895 private final static class TestCompositionCase {
\r
2896 public Normalizer.Mode mode;
\r
2897 public int options;
\r
2898 public String input, expect;
\r
2899 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) {
\r
2901 this.options=options;
\r
2903 this.expect=expect;
\r
2907 public void TestComposition() {
\r
2908 final TestCompositionCase cases[]=new TestCompositionCase[]{
\r
2910 * special cases for UAX #15 bug
\r
2911 * see Unicode Public Review Issue #29
\r
2912 * at http://www.unicode.org/review/resolved-pri.html#pri29
\r
2914 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"),
\r
2915 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),
\r
2916 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"),
\r
2917 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"),
\r
2919 new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327", "\uac00\u0300\u0327"),
\r
2920 new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327\u11a8", "\uac01\u0300\u0327"),
\r
2921 new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\uac00\u0300\u0327\u11a8", "\uac01\u0327\u0300"),
\r
2922 new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u0b47\u0300\u0b3e", "\u0b4b\u0300")
\r
2924 /* TODO: add test cases for UNORM_FCC here (j2151) */
\r
2930 for(i=0; i<cases.length; ++i) {
\r
2931 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options);
\r
2932 if(!output.equals(cases[i].expect)) {
\r
2933 errln("unexpected result for case "+i);
\r