/*
 *******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
\r
8 package com.ibm.icu.dev.test.normalizer;
\r
10 import java.text.StringCharacterIterator;
\r
11 import java.util.Random;
\r
13 import com.ibm.icu.dev.test.TestFmwk;
\r
14 import com.ibm.icu.impl.Norm2AllModes;
\r
15 import com.ibm.icu.impl.Normalizer2Impl;
\r
16 import com.ibm.icu.impl.USerializedSet;
\r
17 import com.ibm.icu.impl.Utility;
\r
18 import com.ibm.icu.lang.UCharacter;
\r
19 import com.ibm.icu.lang.UCharacterCategory;
\r
20 import com.ibm.icu.lang.UProperty;
\r
21 import com.ibm.icu.text.Normalizer;
\r
22 import com.ibm.icu.text.Normalizer2;
\r
23 import com.ibm.icu.text.UCharacterIterator;
\r
24 import com.ibm.icu.text.UTF16;
\r
25 import com.ibm.icu.text.UnicodeSet;
\r
26 import com.ibm.icu.text.UnicodeSetIterator;
\r
29 public class BasicTest extends TestFmwk {
\r
30 public static void main(String[] args) throws Exception {
\r
31 new BasicTest().run(args);
\r
34 String[][] canonTests = {
\r
35 // Input Decomposed Composed
\r
36 { "cat", "cat", "cat" },
\r
37 { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", },
\r
39 { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above
\r
40 { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above
\r
42 { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above
\r
43 { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below
\r
44 { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above
\r
46 { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
\r
47 { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
\r
49 { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave
\r
50 { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave
\r
51 { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron
\r
53 { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign
\r
54 { "\u00c5", "A\u030a", "\u00c5" }, // A-ring
\r
56 { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" },
\r
57 { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" },
\r
59 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0
\r
60 { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0
\r
62 { "Henry IV", "Henry IV", "Henry IV" },
\r
63 { "Henry \u2163", "Henry \u2163", "Henry \u2163" },
\r
65 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
\r
66 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
\r
67 { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten
\r
68 { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten
\r
69 { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten
\r
71 { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
\r
72 {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},
\r
75 String[][] compatTests = {
\r
76 // Input Decomposed Composed
\r
77 { "cat", "cat", "cat" },
\r
78 { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. Alef, Lamed
\r
80 { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" },
\r
81 { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i
\r
83 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0
\r
84 { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i
\r
86 { "Henry IV", "Henry IV", "Henry IV" },
\r
87 { "Henry \u2163", "Henry IV", "Henry IV" },
\r
89 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana)
\r
90 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten
\r
92 { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten
\r
94 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
\r
95 { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten
\r
96 { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten
\r
100 // With Canonical decomposition, Hangul syllables should get decomposed
\r
101 // into Jamo, but Jamo characters should not be decomposed into
\r
103 String[][] hangulCanon = {
\r
104 // Input Decomposed Composed
\r
105 { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" },
\r
106 { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" },
\r
109 // With compatibility decomposition turned on,
\r
110 // it should go all the way down to conjoining Jamo characters.
\r
111 // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE
\r
112 String[][] hangulCompat = {
\r
113 // Input Decomposed Composed
\r
114 // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" },
\r
117 public void TestHangulCompose()
\r
119 // Make sure that the static composition methods work
\r
120 logln("Canonical composition...");
\r
121 staticTest(Normalizer.NFC, hangulCanon, 2);
\r
122 logln("Compatibility composition...");
\r
123 staticTest(Normalizer.NFKC, hangulCompat, 2);
\r
124 // Now try iterative composition....
\r
125 logln("Iterative composition...");
\r
126 Normalizer norm = new Normalizer("", Normalizer.NFC,0);
\r
127 iterateTest(norm, hangulCanon, 2);
\r
129 norm.setMode(Normalizer.NFKD);
\r
130 iterateTest(norm, hangulCompat, 2);
\r
132 // And finally, make sure you can do it in reverse too
\r
133 logln("Reverse iteration...");
\r
134 norm.setMode(Normalizer.NFC);
\r
135 backAndForth(norm, hangulCanon);
\r
138 public void TestHangulDecomp() throws Exception{
\r
139 // Make sure that the static decomposition methods work
\r
140 logln("Canonical decomposition...");
\r
141 staticTest(Normalizer.NFD, hangulCanon, 1);
\r
142 logln("Compatibility decomposition...");
\r
143 staticTest(Normalizer.NFKD, hangulCompat, 1);
\r
145 // Now the iterative decomposition methods...
\r
146 logln("Iterative decomposition...");
\r
147 Normalizer norm = new Normalizer("", Normalizer.NFD,0);
\r
148 iterateTest(norm, hangulCanon, 1);
\r
150 norm.setMode(Normalizer.NFKD);
\r
151 iterateTest(norm, hangulCompat, 1);
\r
153 // And finally, make sure you can do it in reverse too
\r
154 logln("Reverse iteration...");
\r
155 norm.setMode(Normalizer.NFD);
\r
156 backAndForth(norm, hangulCanon);
\r
158 public void TestNone() throws Exception{
\r
159 Normalizer norm = new Normalizer("", Normalizer.NONE,0);
\r
160 iterateTest(norm, canonTests, 0);
\r
161 staticTest(Normalizer.NONE, canonTests, 0);
\r
163 public void TestDecomp() throws Exception{
\r
164 Normalizer norm = new Normalizer("", Normalizer.NFD,0);
\r
165 iterateTest(norm, canonTests, 1);
\r
166 staticTest(Normalizer.NFD, canonTests, 1);
\r
167 decomposeTest(Normalizer.NFD, canonTests, 1);
\r
170 public void TestCompatDecomp() throws Exception{
\r
171 Normalizer norm = new Normalizer("", Normalizer.NFKD,0);
\r
172 iterateTest(norm, compatTests, 1);
\r
173 staticTest(Normalizer.NFKD,compatTests, 1);
\r
174 decomposeTest(Normalizer.NFKD,compatTests, 1);
\r
177 public void TestCanonCompose() throws Exception{
\r
178 Normalizer norm = new Normalizer("", Normalizer.NFC,0);
\r
179 iterateTest(norm, canonTests, 2);
\r
180 staticTest(Normalizer.NFC, canonTests, 2);
\r
181 composeTest(Normalizer.NFC, canonTests, 2);
\r
184 public void TestCompatCompose() throws Exception{
\r
185 Normalizer norm = new Normalizer("", Normalizer.NFKC,0);
\r
186 iterateTest(norm, compatTests, 2);
\r
187 staticTest(Normalizer.NFKC,compatTests, 2);
\r
188 composeTest(Normalizer.NFKC,compatTests, 2);
\r
191 public void TestExplodingBase() throws Exception{
\r
192 // \u017f - Latin small letter long s
\r
193 // \u0307 - combining dot above
\r
194 // \u1e61 - Latin small letter s with dot above
\r
195 // \u1e9b - Latin small letter long s with dot above
\r
196 String[][] canon = {
\r
197 // Input Decomposed Composed
\r
198 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" },
\r
199 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" },
\r
201 String[][] compat = {
\r
202 // Input Decomposed Composed
\r
203 { "\u017f", "s", "s" },
\r
204 { "\u1e9b", "s\u0307", "\u1e61" },
\r
207 staticTest(Normalizer.NFD, canon, 1);
\r
208 staticTest(Normalizer.NFC, canon, 2);
\r
210 staticTest(Normalizer.NFKD, compat, 1);
\r
211 staticTest(Normalizer.NFKC, compat, 2);
\r
216 * The Tibetan vowel sign AA, 0f71, was messed up prior to
\r
217 * Unicode version 2.1.9.
\r
218 * Once 2.1.9 or 3.0 is released, uncomment this test.
\r
220 public void TestTibetan() throws Exception{
\r
221 String[][] decomp = {
\r
222 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
\r
224 String[][] compose = {
\r
225 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
\r
228 staticTest(Normalizer.NFD, decomp, 1);
\r
229 staticTest(Normalizer.NFKD,decomp, 2);
\r
230 staticTest(Normalizer.NFC, compose, 1);
\r
231 staticTest(Normalizer.NFKC,compose, 2);
\r
235 * Make sure characters in the CompositionExclusion.txt list do not get
\r
238 public void TestCompositionExclusion()
\r
240 // This list is generated from CompositionExclusion.txt.
\r
241 // Update whenever the normalizer tables are updated. Note
\r
242 // that we test all characters listed, even those that can be
\r
243 // derived from the Unicode DB and are therefore commented
\r
246 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +
\r
247 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +
\r
248 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +
\r
249 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +
\r
250 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +
\r
251 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +
\r
252 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" +
\r
253 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +
\r
254 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +
\r
255 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +
\r
256 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +
\r
257 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +
\r
258 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +
\r
259 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E";
\r
260 for (int i=0; i<EXCLUDED.length(); ++i) {
\r
261 String a = String.valueOf(EXCLUDED.charAt(i));
\r
262 String b = Normalizer.normalize(a, Normalizer.NFKD);
\r
263 String c = Normalizer.normalize(b, Normalizer.NFC);
\r
265 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
\r
266 hex(b) + " x COMPOSE => " +
\r
268 } else if (isVerbose()) {
\r
269 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
\r
270 hex(b) + " x COMPOSE => " +
\r
274 // The following method works too, but it is somewhat
\r
275 // incestuous. It uses UInfo, which is the same database that
\r
276 // NormalizerBuilder uses, so if something is wrong with
\r
277 // UInfo, the following test won't show it. All it will show
\r
278 // is that NormalizerBuilder has been run with whatever the
\r
279 // current UInfo is.
\r
281 // We comment this out in favor of the test above, which
\r
282 // provides independent verification (but also requires
\r
283 // independent updating).
\r
285 // UInfo uinfo = new UInfo();
\r
286 // for (int i=0; i<=0xFFFF; ++i) {
\r
287 // if (!uinfo.isExcludedComposition((char)i) ||
\r
288 // (!uinfo.hasCanonicalDecomposition((char)i) &&
\r
289 // !uinfo.hasCompatibilityDecomposition((char)i))) continue;
\r
290 // String a = String.valueOf((char)i);
\r
291 // String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0);
\r
292 // String c = Normalizer.normalize(b,Normalizer.COMPOSE,0);
\r
293 // if (c.equals(a)) {
\r
294 // errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
\r
295 // hex(b) + " x COMPOSE => " +
\r
297 // } else if (isVerbose()) {
\r
298 // logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
\r
299 // hex(b) + " x COMPOSE => " +
\r
306 * Test for a problem that showed up just before ICU 1.6 release
\r
307 * having to do with combining characters with an index of zero.
\r
308 * Such characters do not participate in any canonical
\r
309 * decompositions. However, having an index of zero means that
\r
310 * they all share one typeMask[] entry, that is, they all have to
\r
311 * map to the same canonical class, which is not the case, in
\r
314 public void TestZeroIndex()
\r
317 // Expect col1 x COMPOSE_COMPAT => col2
\r
318 // Expect col2 x DECOMP => col3
\r
319 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
\r
320 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
\r
321 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
\r
322 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
\r
323 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
\r
326 for (int i=0; i<DATA.length; i+=3) {
\r
327 String a = DATA[i];
\r
328 String b = Normalizer.normalize(a, Normalizer.NFKC);
\r
329 String exp = DATA[i+1];
\r
330 if (b.equals(exp)) {
\r
331 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));
\r
333 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +
\r
334 ", expect " + hex(exp));
\r
336 a = Normalizer.normalize(b, Normalizer.NFD);
\r
338 if (a.equals(exp)) {
\r
339 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a));
\r
341 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) +
\r
342 ", expect " + hex(exp));
\r
348 * Test for a problem found by Verisign. Problem is that
\r
349 * characters at the start of a string are not put in canonical
\r
350 * order correctly by compose() if there is no starter.
\r
352 public void TestVerisign()
\r
354 String[] inputs = {
\r
355 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
\r
356 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
\r
358 String[] outputs = {
\r
359 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
\r
360 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
\r
363 for (int i = 0; i < inputs.length; ++i) {
\r
364 String input = inputs[i];
\r
365 String output = outputs[i];
\r
366 String result = Normalizer.decompose(input, false);
\r
367 if (!result.equals(output)) {
\r
368 errln("FAIL input: " + hex(input));
\r
369 errln(" decompose: " + hex(result));
\r
370 errln(" expected: " + hex(output));
\r
372 result = Normalizer.compose(input, false);
\r
373 if (!result.equals(output)) {
\r
374 errln("FAIL input: " + hex(input));
\r
375 errln(" compose: " + hex(result));
\r
376 errln(" expected: " + hex(output));
\r
381 public void TestQuickCheckResultNO()
\r
383 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C,
\r
384 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E};
\r
385 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB,
\r
386 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E};
\r
387 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE,
\r
388 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
\r
389 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE,
\r
390 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
\r
393 final int SIZE = 10;
\r
396 for (; count < SIZE; count ++)
\r
398 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
\r
399 Normalizer.NFD,0) != Normalizer.NO)
\r
401 errln("ERROR in NFD quick check at U+" +
\r
402 Integer.toHexString(CPNFD[count]));
\r
405 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
\r
406 Normalizer.NFC,0) !=Normalizer.NO)
\r
408 errln("ERROR in NFC quick check at U+"+
\r
409 Integer.toHexString(CPNFC[count]));
\r
412 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
\r
413 Normalizer.NFKD,0) != Normalizer.NO)
\r
415 errln("ERROR in NFKD quick check at U+"+
\r
416 Integer.toHexString(CPNFKD[count]));
\r
419 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
420 Normalizer.NFKC,0) !=Normalizer.NO)
\r
422 errln("ERROR in NFKC quick check at U+"+
\r
423 Integer.toHexString(CPNFKC[count]));
\r
426 // for improving coverage
\r
427 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
428 Normalizer.NFKC) !=Normalizer.NO)
\r
430 errln("ERROR in NFKC quick check at U+"+
\r
431 Integer.toHexString(CPNFKC[count]));
\r
438 public void TestQuickCheckResultYES()
\r
440 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A,
\r
441 0x2261, 0x3075, 0x4000, 0x5000, 0xF000};
\r
442 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500,
\r
443 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000};
\r
444 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB,
\r
445 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27};
\r
446 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000,
\r
447 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E};
\r
449 final int SIZE = 10;
\r
455 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0)
\r
458 errln("ERROR in NFD quick check at U+"+
\r
459 Integer.toHexString(cp));
\r
462 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0)
\r
465 errln("ERROR in NFC quick check at U+"+
\r
466 Integer.toHexString(cp));
\r
469 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0)
\r
472 errln("ERROR in NFKD quick check at U+" +
\r
473 Integer.toHexString(cp));
\r
476 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0)
\r
479 errln("ERROR in NFKC quick check at U+"+
\r
480 Integer.toHexString(cp));
\r
483 // improve the coverage
\r
484 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC)
\r
487 errln("ERROR in NFKC quick check at U+"+
\r
488 Integer.toHexString(cp));
\r
494 for (; count < SIZE; count ++)
\r
496 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
\r
497 Normalizer.NFD,0)!=Normalizer.YES)
\r
499 errln("ERROR in NFD quick check at U+"+
\r
500 Integer.toHexString(CPNFD[count]));
\r
503 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
\r
504 Normalizer.NFC,0)!=Normalizer.YES)
\r
506 errln("ERROR in NFC quick check at U+"+
\r
507 Integer.toHexString(CPNFC[count]));
\r
510 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
\r
511 Normalizer.NFKD,0)!=Normalizer.YES)
\r
513 errln("ERROR in NFKD quick check at U+"+
\r
514 Integer.toHexString(CPNFKD[count]));
\r
517 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
518 Normalizer.NFKC,0)!=Normalizer.YES)
\r
520 errln("ERROR in NFKC quick check at U+"+
\r
521 Integer.toHexString(CPNFKC[count]));
\r
524 // improve the coverage
\r
525 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
526 Normalizer.NFKC)!=Normalizer.YES)
\r
528 errln("ERROR in NFKC quick check at U+"+
\r
529 Integer.toHexString(CPNFKC[count]));
\r
534 public void TestBengali() throws Exception{
\r
535 String input = "\u09bc\u09be\u09cd\u09be";
\r
536 String output=Normalizer.normalize(input,Normalizer.NFC);
\r
537 if(!input.equals(output)){
\r
538 errln("ERROR in NFC of string");
\r
541 public void TestQuickCheckResultMAYBE()
\r
544 final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161,
\r
545 0x116A, 0x1173, 0x1175, 0x3099, 0x309A};
\r
546 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E,
\r
547 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099};
\r
550 final int SIZE = 10;
\r
554 /* NFD and NFKD does not have any MAYBE codepoints */
\r
555 for (; count < SIZE; count ++)
\r
557 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
\r
558 Normalizer.NFC,0)!=Normalizer.MAYBE)
\r
560 errln("ERROR in NFC quick check at U+"+
\r
561 Integer.toHexString(CPNFC[count]));
\r
564 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
\r
565 Normalizer.NFKC,0)!=Normalizer.MAYBE)
\r
567 errln("ERROR in NFKC quick check at U+"+
\r
568 Integer.toHexString(CPNFKC[count]));
\r
571 if (Normalizer.quickCheck(new char[]{CPNFC[count]},
\r
572 Normalizer.NFC,0)!=Normalizer.MAYBE)
\r
574 errln("ERROR in NFC quick check at U+"+
\r
575 Integer.toHexString(CPNFC[count]));
\r
578 if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
\r
579 Normalizer.NFKC,0)!=Normalizer.MAYBE)
\r
581 errln("ERROR in NFKC quick check at U+"+
\r
582 Integer.toHexString(CPNFKC[count]));
\r
585 if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
\r
586 Normalizer.NONE,0)!=Normalizer.YES)
\r
588 errln("ERROR in NONE quick check at U+"+
\r
589 Integer.toHexString(CPNFKC[count]));
\r
595 public void TestQuickCheckStringResult()
\r
601 for (count = 0; count < canonTests.length; count ++)
\r
603 d = canonTests[count][1];
\r
604 c = canonTests[count][2];
\r
605 if (Normalizer.quickCheck(d,Normalizer.NFD,0)
\r
608 errln("ERROR in NFD quick check for string at count " + count);
\r
612 if (Normalizer.quickCheck(c, Normalizer.NFC,0)
\r
615 errln("ERROR in NFC quick check for string at count " + count);
\r
620 for (count = 0; count < compatTests.length; count ++)
\r
622 d = compatTests[count][1];
\r
623 c = compatTests[count][2];
\r
624 if (Normalizer.quickCheck(d, Normalizer.NFKD,0)
\r
627 errln("ERROR in NFKD quick check for string at count " + count);
\r
631 if (Normalizer.quickCheck(c, Normalizer.NFKC,0)
\r
634 errln("ERROR in NFKC quick check for string at count " + count);
\r
640 static final int qcToInt(Normalizer.QuickCheckResult qc) {
\r
641 if(qc==Normalizer.NO) {
\r
643 } else if(qc==Normalizer.YES) {
\r
645 } else /* Normalizer.MAYBE */ {
\r
650 public void TestQuickCheckPerCP() {
\r
651 int c, lead, trail;
\r
653 int lccc1, lccc2, tccc1, tccc2;
\r
657 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES
\r
658 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 ||
\r
659 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE
\r
660 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 ||
\r
661 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) ||
\r
662 UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)
\r
664 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS");
\r
668 * compare the quick check property values for some code points
\r
669 * to the quick check results for checking same-code point strings
\r
672 while(c<0x110000) {
\r
673 s=UTF16.valueOf(c);
\r
675 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK);
\r
676 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC));
\r
678 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c));
\r
681 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK);
\r
682 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD));
\r
684 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c));
\r
687 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK);
\r
688 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC));
\r
690 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c));
\r
693 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK);
\r
694 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD));
\r
696 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c));
\r
699 nfd=Normalizer.normalize(s, Normalizer.NFD);
\r
700 lead=UTF16.charAt(nfd, 0);
\r
701 trail=UTF16.charAt(nfd, nfd.length()-1);
\r
703 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS);
\r
704 lccc2=UCharacter.getCombiningClass(lead);
\r
705 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
\r
706 tccc2=UCharacter.getCombiningClass(trail);
\r
709 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c));
\r
712 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c));
\r
715 /* skip some code points */
\r
//------------------------------------------------------------------------
// Internal utilities
//

// Disabled single-string variant of backAndForth, kept for reference.
//
// private void backAndForth(Normalizer iter, String input)
// {
//     iter.setText(input);
//
//     // Run through the iterator forwards and stick it into a StringBuffer
//     StringBuffer forward = new StringBuffer();
//     for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
//         forward.append(ch);
//     }
//
//     // Now do it backwards
//     StringBuffer reverse = new StringBuffer();
//     for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
//         reverse.insert(0, ch);
//     }
//
//     if (!forward.toString().equals(reverse.toString())) {
//         errln("FAIL: Forward/reverse mismatch for input " + hex(input)
//               + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
//     } else if (isVerbose()) {
//         logln("Ok: Forward/reverse for input " + hex(input)
//               + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
//     }
// }
\r
752 private void backAndForth(Normalizer iter, String[][] tests)
\r
754 for (int i = 0; i < tests.length; i++)
\r
756 iter.setText(tests[i][0]);
\r
758 // Run through the iterator forwards and stick it into a
\r
760 StringBuffer forward = new StringBuffer();
\r
761 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
\r
762 forward.append(ch);
\r
765 // Now do it backwards
\r
766 StringBuffer reverse = new StringBuffer();
\r
767 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
\r
768 reverse.insert(0, ch);
\r
771 if (!forward.toString().equals(reverse.toString())) {
\r
772 errln("FAIL: Forward/reverse mismatch for input "
\r
773 + hex(tests[i][0]) + ", forward: " + hex(forward)
\r
774 + ", backward: " + hex(reverse));
\r
775 } else if (isVerbose()) {
\r
776 logln("Ok: Forward/reverse for input " + hex(tests[i][0])
\r
777 + ", forward: " + hex(forward) + ", backward: "
\r
783 private void staticTest (Normalizer.Mode mode,
\r
784 String[][] tests, int outCol) throws Exception{
\r
785 for (int i = 0; i < tests.length; i++)
\r
787 String input = Utility.unescape(tests[i][0]);
\r
788 String expect = Utility.unescape(tests[i][outCol]);
\r
790 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
792 String output = Normalizer.normalize(input, mode);
\r
794 if (!output.equals(expect)) {
\r
795 errln("FAIL: case " + i
\r
796 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
797 + " but got '" + output + "' (" + hex(output) + ")" );
\r
800 char[] output = new char[1];
\r
801 for (int i = 0; i < tests.length; i++)
\r
803 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
804 String expect =Utility.unescape( tests[i][outCol]);
\r
806 logln("Normalizing '" + new String(input) + "' (" +
\r
807 hex(new String(input)) + ")" );
\r
811 reqLength=Normalizer.normalize(input,output, mode,0);
\r
812 if(reqLength<=output.length ){
\r
815 }catch(IndexOutOfBoundsException e){
\r
816 output= new char[Integer.parseInt(e.getMessage())];
\r
820 if (!expect.equals(new String(output,0,reqLength))) {
\r
821 errln("FAIL: case " + i
\r
822 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
823 + " but got '" + new String(output)
\r
824 + "' (" + hex(new String(output)) + ")" );
\r
828 private void decomposeTest(Normalizer.Mode mode,
\r
829 String[][] tests, int outCol) throws Exception{
\r
830 for (int i = 0; i < tests.length; i++)
\r
832 String input = Utility.unescape(tests[i][0]);
\r
833 String expect = Utility.unescape(tests[i][outCol]);
\r
835 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
837 String output = Normalizer.decompose(input, mode==Normalizer.NFKD);
\r
839 if (!output.equals(expect)) {
\r
840 errln("FAIL: case " + i
\r
841 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
842 + " but got '" + output + "' (" + hex(output) + ")" );
\r
845 char[] output = new char[1];
\r
846 for (int i = 0; i < tests.length; i++)
\r
848 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
849 String expect = Utility.unescape(tests[i][outCol]);
\r
851 logln("Normalizing '" + new String(input) + "' (" +
\r
852 hex(new String(input)) + ")" );
\r
856 reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0);
\r
857 if(reqLength<=output.length ){
\r
860 }catch(IndexOutOfBoundsException e){
\r
861 output= new char[Integer.parseInt(e.getMessage())];
\r
865 if (!expect.equals(new String(output,0,reqLength))) {
\r
866 errln("FAIL: case " + i
\r
867 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
868 + " but got '" + new String(output)
\r
869 + "' (" + hex(new String(output)) + ")" );
\r
872 output = new char[1];
\r
873 for (int i = 0; i < tests.length; i++)
\r
875 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
876 String expect = Utility.unescape(tests[i][outCol]);
\r
878 logln("Normalizing '" + new String(input) + "' (" +
\r
879 hex(new String(input)) + ")" );
\r
883 reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0);
\r
884 if(reqLength<=output.length ){
\r
887 }catch(IndexOutOfBoundsException e){
\r
888 output= new char[Integer.parseInt(e.getMessage())];
\r
892 if (!expect.equals(new String(output,0,reqLength))) {
\r
893 errln("FAIL: case " + i
\r
894 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
895 + " but got '" + new String(output)
\r
896 + "' (" + hex(new String(output)) + ")" );
\r
898 char[] output2 = new char[reqLength * 2];
\r
899 System.arraycopy(output, 0, output2, 0, reqLength);
\r
900 int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
\r
901 if(retLength != reqLength){
\r
902 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
\r
907 private void composeTest(Normalizer.Mode mode,
\r
908 String[][] tests, int outCol) throws Exception{
\r
909 for (int i = 0; i < tests.length; i++)
\r
911 String input = Utility.unescape(tests[i][0]);
\r
912 String expect = Utility.unescape(tests[i][outCol]);
\r
914 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
916 String output = Normalizer.compose(input, mode==Normalizer.NFKC);
\r
918 if (!output.equals(expect)) {
\r
919 errln("FAIL: case " + i
\r
920 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
921 + " but got '" + output + "' (" + hex(output) + ")" );
\r
924 char[] output = new char[1];
\r
925 for (int i = 0; i < tests.length; i++)
\r
927 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
928 String expect = Utility.unescape(tests[i][outCol]);
\r
930 logln("Normalizing '" + new String(input) + "' (" +
\r
931 hex(new String(input)) + ")" );
\r
935 reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0);
\r
936 if(reqLength<=output.length ){
\r
939 }catch(IndexOutOfBoundsException e){
\r
940 output= new char[Integer.parseInt(e.getMessage())];
\r
944 if (!expect.equals(new String(output,0,reqLength))) {
\r
945 errln("FAIL: case " + i
\r
946 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
947 + " but got '" + new String(output)
\r
948 + "' (" + hex(new String(output)) + ")" );
\r
951 output = new char[1];
\r
952 for (int i = 0; i < tests.length; i++)
\r
954 char[] input = Utility.unescape(tests[i][0]).toCharArray();
\r
955 String expect = Utility.unescape(tests[i][outCol]);
\r
957 logln("Normalizing '" + new String(input) + "' (" +
\r
958 hex(new String(input)) + ")" );
\r
962 reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0);
\r
963 if(reqLength<=output.length ){
\r
966 }catch(IndexOutOfBoundsException e){
\r
967 output= new char[Integer.parseInt(e.getMessage())];
\r
971 if (!expect.equals(new String(output,0,reqLength))) {
\r
972 errln("FAIL: case " + i
\r
973 + " expected '" + expect + "' (" + hex(expect) + ")"
\r
974 + " but got '" + new String(output)
\r
975 + "' (" + hex(new String(output)) + ")" );
\r
978 char[] output2 = new char[reqLength * 2];
\r
979 System.arraycopy(output, 0, output2, 0, reqLength);
\r
980 int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
\r
981 if(retLength != reqLength){
\r
982 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
\r
// Feeds every test case through the supplied Normalizer iterator.
// For each row, the input (column 0) and the expected text (column outCol)
// are unescaped with Utility.unescape(), the input is installed with
// iter.setText(), and the iteration results are verified by assertEqual().
986 private void iterateTest(Normalizer iter, String[][] tests, int outCol){
\r
987 for (int i = 0; i < tests.length; i++)
\r
989 String input = Utility.unescape(tests[i][0]);
\r
990 String expect = Utility.unescape(tests[i][outCol]);
\r
992 logln("Normalizing '" + input + "' (" + hex(input) + ")" );
\r
994 iter.setText(input);
\r
// The message prefix identifies the failing row in assertEqual()'s output.
995 assertEqual(expect, iter, "case " + i + " ");
\r
// Compares the code points produced by a Normalizer iterator with an
// expected string; msg is prepended to every failure message.
// Forward pass: calls iter.next() until Normalizer.DONE, reports a code
// point produced once index has reached expected.length(), and advances
// index by UTF16.getCharCount(ch). Afterwards it reports when index is
// still short of expected.length().
// Backward pass: moves cIter to its limit and compares each
// iter.previous() result with cIter.previousCodePoint().
999 private void assertEqual(String expected, Normalizer iter, String msg)
\r
1003 UCharacterIterator cIter = UCharacterIterator.getInstance(expected);
\r
1005 while ((ch=iter.next())!= Normalizer.DONE){
\r
1006 if (index >= expected.length()) {
\r
1007 errln("FAIL: " + msg + "Unexpected character '" + (char)ch
\r
1008 + "' (" + hex(ch) + ")"
\r
1009 + " at index " + index);
\r
// NOTE(review): want is an int, so the failure messages below print it
// as a decimal number between the quotes rather than as a character.
1012 int want = UTF16.charAt(expected,index);
\r
1014 errln("FAIL: " + msg + "got '" + (char)ch
\r
1015 + "' (" + hex(ch) + ")"
\r
1016 + " but expected '" + want + "' (" + hex(want)+ ")"
\r
1017 + " at index " + index);
\r
1019 index+= UTF16.getCharCount(ch);
\r
1021 if (index < expected.length()) {
\r
1022 errln("FAIL: " + msg + "Only got " + index + " chars, expected "
\r
1023 + expected.length());
\r
// Reverse check: walk both iterators backwards from the end.
1026 cIter.setToLimit();
\r
1027 while((ch=iter.previous())!=Normalizer.DONE){
\r
1028 int want = cIter.previousCodePoint();
\r
1029 if (ch != want ) {
\r
1030 errln("FAIL: " + msg + "got '" + (char)ch
\r
1031 + "' (" + hex(ch) + ")"
\r
1032 + " but expected '" + want + "' (" + hex(want) + ")"
\r
1033 + " at index " + index);
\r
1037 //--------------------------------------------------------------------------
\r
1039 // NOTE: These tests are used for quick debugging so are not ported
\r
1040 // to ICU4C tsnorm.cpp in intltest
\r
// Debugging test for the static Normalizer API.
// First checks that Normalizer.isNormalized(..., Normalizer.NFC, 0) accepts
// U+1D157 U+1D165. Then normalizes a long mixed input (repeated U+AD8B,
// supplementary code points written as \\U escapes, long ASCII runs and a
// trailing combining sequence) and compares the output with a fixed
// expected string, reporting both in hex on mismatch.
// NOTE(review): the mode argument of the normalize() call is on a line not
// visible in this excerpt.
1043 public void TestDebugStatic(){
\r
1044 String in = Utility.unescape("\\U0001D157\\U0001D165");
\r
1045 if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){
\r
1046 errln("isNormalized failed");
\r
// The \\U escapes in this literal are expanded by Utility.unescape() below.
1049 String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1050 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1051 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1052 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1053 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1054 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1055 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1056 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1057 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1058 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1059 "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1060 "d\u031B\u0307\u0323";
\r
// The expected text spells supplementary code points as surrogate pairs.
1061 String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
\r
1062 "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+
\r
1063 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1064 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1065 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1066 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1067 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1068 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1069 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1070 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1071 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1072 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1073 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1074 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1075 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1076 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
\r
1077 "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+
\r
1078 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1079 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1080 "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+
\r
1081 "cccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1082 "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1083 "dddddddddddddddddddddddd"+
\r
1084 "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
\r
1085 "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307";
\r
1086 String output = Normalizer.normalize(Utility.unescape(input),
\r
1088 if(!expect.equals(output)){
\r
1089 errln("FAIL expected: "+hex(expect) + " got: "+hex(output));
\r
// Debugging test for Normalizer iteration in Normalizer.NONE mode.
// The source and expected strings are the same sequence of supplementary
// code points (U+1D15E U+1D157 U+1D165 U+1D15E). The test iterates forward
// with next() and then backward with previous(), comparing each result
// with the expected text and reporting mismatches through errln().
1095 public void TestDebugIter(){
\r
1096 String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
\r
1097 String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
\r
// NOTE(review): src was already unescaped above; Utility.unescape() is
// applied a second time here to the already-unescaped text.
1098 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)),
\r
1099 Normalizer.NONE,0);
\r
1102 UCharacterIterator cIter = UCharacterIterator.getInstance(expected);
\r
// Forward pass.
1104 while ((ch=iter.next())!= Normalizer.DONE){
\r
1105 if (index >= expected.length()) {
\r
1106 errln("FAIL: " + "Unexpected character '" + (char)ch
\r
1107 + "' (" + hex(ch) + ")"
\r
1108 + " at index " + index);
\r
1111 int want = UTF16.charAt(expected,index);
\r
1113 errln("FAIL: " + "got '" + (char)ch
\r
1114 + "' (" + hex(ch) + ")"
\r
1115 + " but expected '" + want + "' (" + hex(want)+ ")"
\r
1116 + " at index " + index);
\r
1118 index+= UTF16.getCharCount(ch);
\r
1120 if (index < expected.length()) {
\r
1121 errln("FAIL: " + "Only got " + index + " chars, expected "
\r
1122 + expected.length());
\r
// Backward pass: compare against the expected text read from its end.
1125 cIter.setToLimit();
\r
1126 while((ch=iter.previous())!=Normalizer.DONE){
\r
1127 int want = cIter.previousCodePoint();
\r
1128 if (ch != want ) {
\r
1129 errln("FAIL: " + "got '" + (char)ch
\r
1130 + "' (" + hex(ch) + ")"
\r
1131 + " but expected '" + want + "' (" + hex(want) + ")"
\r
1132 + " at index " + index);
\r
// Debugging test that iterates a Normalizer (Normalizer.NFKC) over U+1D15E
// using the first()/next() loop form, collects the code points into a
// StringBuffer and compares the result with `expected`. It then positions
// the iterator at endIndex() and walks backwards with previous(), comparing
// the collected text with `expectedReverse`.
1136 public void TestDebugIterOld(){
\r
1137 String input = "\\U0001D15E";
\r
1138 String expected = "\uD834\uDD57\uD834\uDD65";
\r
1139 String expectedReverse = "\uD834\uDD65\uD834\uDD57";
\r
1142 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)),
\r
1143 Normalizer.NFKC,0);
\r
1144 StringBuffer got = new StringBuffer();
\r
1145 for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next())
\r
1147 if (index >= expected.length()) {
\r
1148 errln("FAIL: " + "Unexpected character '" + (char)ch +
\r
1149 "' (" + hex(ch) + ")" + " at index " + index);
\r
1152 got.append(UCharacter.toString(ch));
\r
1155 if (!expected.equals(got.toString())) {
\r
1156 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")"
\r
1157 + " but expected '" + expected + "' ("
\r
1158 + hex(expected) + ")");
\r
1160 if (got.length() < expected.length()) {
\r
1161 errln("FAIL: " + "Only got " + index + " chars, expected "
\r
1162 + expected.length());
\r
1165 logln("Reverse Iteration\n");
\r
1166 iter.setIndexOnly(iter.endIndex());
\r
1168 for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){
\r
1169 if (index >= expected.length()) {
\r
1170 errln("FAIL: " + "Unexpected character '" + (char)ch
\r
1171 + "' (" + hex(ch) + ")" + " at index " + index);
\r
1174 got.append(UCharacter.toString(ch));
\r
// NOTE(review): the comparison uses expectedReverse, but the failure
// message below prints `expected`; the reported expectation is therefore
// not the string that was actually compared.
1176 if (!expectedReverse.equals(got.toString())) {
\r
1177 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")"
\r
1178 + " but expected '" + expected
\r
1179 + "' (" + hex(expected) + ")");
\r
1181 if (got.length() < expected.length()) {
\r
1182 errln("FAIL: " + "Only got " + index + " chars, expected "
\r
1183 + expected.length());
\r
1187 //--------------------------------------------------------------------------
\r
1188 // helper class for TestPreviousNext()
\r
1189 // simple UTF-32 character iterator
\r
// Iterator over an int[] of code points; the previous/next tests construct
// it over their `expect` array and use it as the reference to compare the
// Normalizer against.
// NOTE(review): the method bodies are not visible in this excerpt, so the
// exact end-of-range behaviour of current()/next()/previous() should be
// confirmed against the full source.
1190 class UCharIterator {
\r
// Takes the code point array, its length and the starting index.
1192 public UCharIterator(int[] src, int len, int index){
\r
1199 public int current() {
\r
1207 public int next() {
\r
1215 public int previous() {
\r
1223 public int getIndex() {
\r
1228 private int length, i;
\r
// Moves a Normalizer (Normalizer.NFD, built from a String of `src`) and a
// reference UCharIterator over `expect` in lockstep, following the `moves`
// script ('-' = previous(), '0' = current(), '+' = next()).
// After each move the two code points are compared, and the Normalizer's
// getIndex() is compared with expectIndex[iter32.getIndex()], which maps a
// position in `expect` to the corresponding position in `src`.
// On a mismatch the moves executed so far are included in the message.
1230 public void TestPreviousNext() {
\r
1231 // src and expect strings
\r
1233 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
\r
1234 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
\r
1242 0x4f, 0x302, 0x301
\r
1245 // expected src indexes corresponding to expect indexes
\r
1246 int expectIndex[]={
\r
1251 6 // behind last character
\r
1254 // initial indexes into the src and expect strings
\r
1256 final int SRC_MIDDLE=4;
\r
1257 final int EXPECT_MIDDLE=3;
\r
1260 // movement vector
\r
1261 // - for previous(), 0 for current(), + for next()
\r
1262 // not const so that we can terminate it below for the error message
\r
1263 String moves="0+0+0--0-0-+++0--+++++++0--------";
\r
1266 Normalizer iter = new Normalizer(new String(src),
\r
1267 Normalizer.NFD,0);
\r
1268 UCharIterator iter32 = new UCharIterator(expect, expect.length,
\r
1274 // initially set the indexes into the middle of the strings
\r
1275 iter.setIndexOnly(SRC_MIDDLE);
\r
1277 // move around and compare the iteration code points with
\r
1278 // the expected ones
\r
1279 int movesIndex =0;
\r
1280 while(movesIndex<moves.length()) {
\r
1281 m=moves.charAt(movesIndex++);
\r
1283 c1=iter.previous();
\r
1284 c2=iter32.previous();
\r
1285 } else if(m=='0') {
\r
1286 c1=iter.current();
\r
1287 c2=iter32.current();
\r
1288 } else /* m=='+' */ {
\r
1293 // compare results
\r
1295 // copy the moves until the current (m) move, and terminate
\r
1296 String history = moves.substring(0,movesIndex);
\r
1297 errln("error: mismatch in Normalizer iteration at "+history+": "
\r
1298 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
\r
1302 // compare indexes
\r
1303 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
\r
1304 // copy the moves until the current (m) move, and terminate
\r
1305 String history = moves.substring(0,movesIndex);
\r
1306 errln("error: index mismatch in Normalizer iteration at "
\r
1307 +history+ " : "+ "Normalizer index " +iter.getIndex()
\r
1308 +" expected "+ expectIndex[iter32.getIndex()]);
\r
// Variant of the previous/next lockstep test in which the Normalizer
// (Normalizer.NFD) is constructed from a java.text.StringCharacterIterator
// instead of a String.
// The `moves` script ('-' = previous(), '0' = current(), '+' = next())
// drives both the Normalizer and a reference UCharIterator over `expect`;
// after each move the code points and the mapped source index
// (expectIndex[iter32.getIndex()]) are compared.
1314 public void TestPreviousNextJCI() {
\r
1315 // src and expect strings
\r
1317 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
\r
1318 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
\r
1326 0x4f, 0x302, 0x301
\r
1329 // expected src indexes corresponding to expect indexes
\r
1330 int expectIndex[]={
\r
1335 6 // behind last character
\r
1338 // initial indexes into the src and expect strings
\r
1340 final int SRC_MIDDLE=4;
\r
1341 final int EXPECT_MIDDLE=3;
\r
1344 // movement vector
\r
1345 // - for previous(), 0 for current(), + for next()
\r
1346 // not const so that we can terminate it below for the error message
\r
1347 String moves="0+0+0--0-0-+++0--+++++++0--------";
\r
1350 StringCharacterIterator text = new StringCharacterIterator(new String(src));
\r
1351 Normalizer iter = new Normalizer(text,Normalizer.NFD,0);
\r
1352 UCharIterator iter32 = new UCharIterator(expect, expect.length,
\r
1358 // initially set the indexes into the middle of the strings
\r
1359 iter.setIndexOnly(SRC_MIDDLE);
\r
1361 // move around and compare the iteration code points with
\r
1362 // the expected ones
\r
1363 int movesIndex =0;
\r
1364 while(movesIndex<moves.length()) {
\r
1365 m=moves.charAt(movesIndex++);
\r
1367 c1=iter.previous();
\r
1368 c2=iter32.previous();
\r
1369 } else if(m=='0') {
\r
1370 c1=iter.current();
\r
1371 c2=iter32.current();
\r
1372 } else /* m=='+' */ {
\r
1377 // compare results
\r
1379 // copy the moves until the current (m) move, and terminate
\r
1380 String history = moves.substring(0,movesIndex);
\r
1381 errln("error: mismatch in Normalizer iteration at "+history+": "
\r
1382 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
\r
1386 // compare indexes
\r
1387 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
\r
1388 // copy the moves until the current (m) move, and terminate
\r
1389 String history = moves.substring(0,movesIndex);
\r
1390 errln("error: index mismatch in Normalizer iteration at "
\r
1391 +history+ " : "+ "Normalizer index " +iter.getIndex()
\r
1392 +" expected "+ expectIndex[iter32.getIndex()]);
\r
1398 // test APIs that are not otherwise used - improve test coverage
\r
// Exercises Normalizer instance and static APIs in sequence:
// construction from a UCharacterIterator, clone()/equals()/hashCode(),
// static compose()/decompose(), setText()/getText() in several overloads,
// setMode()/getMode(), last() on a buffer ending in U+0308,
// Normalizer.NONE iteration, and normalize(int, Mode[, int]).
// Each failed expectation is reported through errln().
1399 public void TestNormalizerAPI() throws Exception {
\r
1401 // instantiate a Normalizer from a CharacterIterator
\r
1402 String s=Utility.unescape("a\u0308\uac00\\U0002f800");
\r
1403 // make s a bit longer and more interesting
\r
1404 UCharacterIterator iter = UCharacterIterator.getInstance(s+s);
\r
1405 Normalizer norm = new Normalizer(iter, Normalizer.NFC,0);
\r
1406 if(norm.next()!=0xe4) {
\r
1407 errln("error in Normalizer(CharacterIterator).next()");
\r
1410 // test clone(), ==, and hashCode()
\r
1411 Normalizer clone=(Normalizer)norm.clone();
\r
// NOTE(review): an error is reported when clone.equals(norm) is true,
// while the message text reads "clone()!=norm" - confirm the intended
// direction of this check.
1412 if(clone.equals(norm)) {
\r
1413 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm");
\r
// NOTE(review): this compares getLength() of the two objects, but the
// message names getBeginIndex().
1417 if(clone.getLength()!= norm.getLength()){
\r
1418 errln("error in Normalizer.getBeginIndex()");
\r
1420 // clone must have the same hashCode()
\r
1421 //if(clone.hashCode()!=norm.hashCode()) {
\r
1422 // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()");
\r
1424 if(clone.next()!=0xac00) {
\r
1425 errln("error in Normalizer(Normalizer(CharacterIterator)).next()");
\r
1427 int ch = clone.next();
\r
1429 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()");
\r
1431 // position changed, must change hashCode()
\r
1432 if(clone.hashCode()==norm.hashCode()) {
\r
1433 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()");
\r
1436 // test compose() and decompose()
\r
1438 String nfkc, nfkd;
\r
// Ten U+2121 characters with U+0301 inserted after the first one.
1439 tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121");
\r
1440 tel.insert(1,(char)0x0301);
\r
1442 nfkc=Normalizer.compose(tel.toString(), true);
\r
1443 nfkd=Normalizer.decompose(tel.toString(), true);
\r
1445 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))||
\r
1446 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL"))
\r
1448 errln("error in Normalizer::(de)compose(): wrong result(s)");
\r
1451 // test setIndex()
\r
1452 // ch=norm.setIndex(3);
\r
1453 // if(ch!=0x4e3d) {
\r
1454 // errln("error in Normalizer(CharacterIterator).setIndex(3)");
\r
1457 // test setText(CharacterIterator) and getText()
\r
1459 clone.setText(iter);
\r
1461 out = clone.getText();
\r
1462 out2 = iter.getText();
\r
1463 if( !out.equals(out2) ||
\r
1464 clone.startIndex()!=0||
\r
1465 clone.endIndex()!=iter.getLength()
\r
1467 errln("error in Normalizer::setText() or Normalizer::getText()");
\r
// Same check through the char[]-filling getText() overloads.
1470 char[] fillIn1 = new char[clone.getLength()];
\r
1471 char[] fillIn2 = new char[iter.getLength()];
\r
1472 int len = clone.getText(fillIn1);
\r
1473 iter.getText(fillIn2,0);
\r
1474 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
\r
1475 errln("error in Normalizer.getText(). Normalizer: "+
\r
1476 Utility.hex(new String(fillIn1))+
\r
1477 " Iter: " + Utility.hex(new String(fillIn2)));
\r
// Round trip: set the text from a char[] and read it back.
1480 clone.setText(fillIn1);
\r
1481 len = clone.getText(fillIn2);
\r
1482 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
\r
1483 errln("error in Normalizer.setText() or Normalizer.getText()"+
\r
1484 Utility.hex(new String(fillIn1))+
\r
1485 " Iter: " + Utility.hex(new String(fillIn2)));
\r
1488 // test setText(UChar *), getUMode() and setMode()
\r
1490 clone.setIndexOnly(1);
\r
1491 clone.setMode(Normalizer.NFD);
\r
1492 if(clone.getMode()!=Normalizer.NFD) {
\r
1493 errln("error in Normalizer::setMode() or Normalizer::getMode()");
\r
1495 if(clone.next()!=0x308 || clone.next()!=0x1100) {
\r
1496 errln("error in Normalizer::setText() or Normalizer::setMode()");
\r
1499 // test last()/previous() with an internal buffer overflow
\r
1500 StringBuffer buf = new StringBuffer("aaaaaaaaaa");
\r
1501 buf.setCharAt(10-1,'\u0308');
\r
1502 clone.setText(buf);
\r
1503 if(clone.last()!=0x308) {
\r
1504 errln("error in Normalizer(10*U+0308).last()");
\r
1507 // test UNORM_NONE
\r
1508 norm.setMode(Normalizer.NONE);
\r
1509 if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) {
\r
1510 errln("error in Normalizer(UNORM_NONE).first()/next()/last()");
\r
1512 out=Normalizer.normalize(s, Normalizer.NONE);
\r
1513 if(!out.equals(s)) {
\r
1514 errln("error in Normalizer::normalize(UNORM_NONE)");
\r
// normalize() overloads that take a single code point (ch) as input.
1517 String exp = "\\U0001D157\\U0001D165";
\r
1518 String ns = Normalizer.normalize(ch,Normalizer.NFC);
\r
1519 if(!ns.equals(Utility.unescape(exp))){
\r
1520 errln("error in Normalizer.normalize(int,Mode)");
\r
1522 ns = Normalizer.normalize(ch,Normalizer.NFC,0);
\r
1523 if(!ns.equals(Utility.unescape(exp))){
\r
1524 errln("error in Normalizer.normalize(int,Mode,int)");
\r
1528 }catch(Exception e){
\r
// Table-driven test of Normalizer.concatenate().
// Each row of `cases` holds { mode, left, right, expected result }. For
// every row the test calls the String overload and then the char[]
// overload of concatenate() with options 0, and reports a failure (with
// expected and actual values in hex) when the result differs.
1533 public void TestConcatenate() {
\r
1535 Object[][]cases=new Object[][]{
\r
1536 /* mode, left, right, result */
\r
1540 "\u0301sum\u00e9",
\r
1541 "r\u00e9sum\u00e9"
\r
1546 "\u1161bcdefghijk",
\r
1547 "a\uac00bcdefghijk"
\r
1549 /* ### TODO: add more interesting cases */
\r
1553 "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169
\r
1554 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345
\r
1558 String left, right, expect, result;
\r
1559 Normalizer.Mode mode;
\r
1562 /* test concatenation */
\r
1563 for(i=0; i<cases.length; ++i) {
\r
1564 mode = (Normalizer.Mode)cases[i][0];
\r
1566 left=(String)cases[i][1];
\r
1567 right=(String)cases[i][2];
\r
1568 expect=(String)cases[i][3];
\r
// String overload.
1570 result=Normalizer.concatenate(left, right, mode,0);
\r
1571 if(!result.equals(expect)) {
\r
1572 errln("error in Normalizer.concatenate(), cases[] failed"
\r
1573 +", result==expect: expected: "
\r
1574 + hex(expect)+" =========> got: " + hex(result));
\r
// char[] overload, checked against the same expectation.
1578 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0);
\r
1579 if(!result.equals(expect)) {
\r
1580 errln("error in Normalizer.concatenate(), cases[] failed"
\r
1581 +", result==expect: expected: "
\r
1582 + hex(expect)+" =========> got: " + hex(result));
\r
// Bound passed to rand.nextInt() in TestCheckFCD() and used as the divisor
// when scaling the random value to an index into datachar.
1587 private final int RAND_MAX = 0x7fff;
\r
// Tests Normalizer.quickCheck() in Normalizer.FCD mode.
// Fixed data: FAST and TRUE are expected to yield Normalizer.YES, FALSE is
// expected to yield Normalizer.NO, and each row of datastr is expected to
// yield the corresponding entry of result[].
// Random data: for 50 rounds a string is assembled from datachar entries;
// each character is normalized to NFD on its own and appended to `norm`,
// the string is also normalized to NFD as a whole into `nfd`, and the
// expected quickCheck answer is YES only when length and contents agree.
1589 public void TestCheckFCD()
\r
1591 char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
\r
1592 0x0008, 0x0009, 0x000A};
\r
1594 char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301,
\r
1595 0x02B9, 0x0314, 0x0315, 0x0316};
\r
1597 char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7,
\r
1598 0x0050, 0x0730, 0x09EE, 0x1E10};
\r
1600 char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0},
\r
1601 {0x0061, 0x030A, 0x00E2, 0x0323, 0},
\r
1602 {0x0061, 0x0323, 0x00E2, 0x0323, 0},
\r
1603 {0x0061, 0x0323, 0x1E05, 0x0302, 0}
\r
// Expected quickCheck(FCD) outcome for each datastr row, in order.
1605 Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES};
\r
// Pool of characters from which the random strings are drawn.
1607 char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
\r
1609 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
\r
1611 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306,
\r
1612 0x0307, 0x0308, 0x0309, 0x030a,
\r
1613 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326,
\r
1614 0x0327, 0x0328, 0x0329, 0x032a,
\r
1615 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06,
\r
1616 0x1e07, 0x1e08, 0x1e09, 0x1e0a
\r
1621 if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES)
\r
1622 errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n");
\r
1623 if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO)
\r
1624 errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n");
\r
1625 if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES)
\r
1626 errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n");
\r
1631 Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0);
\r
1632 if (result[count] != fcdresult) {
\r
1633 errln("Normalizer.quickCheck(FCD) failed: Data set "+ count
\r
1634 + " expected value "+ result[count]);
\r
1639 /* random checks of long strings */
\r
1640 //srand((unsigned)time( NULL ));
\r
1641 Random rand = createRandom(); // use test framework's random
\r
1643 for (count = 0; count < 50; count ++)
\r
1646 Normalizer.QuickCheckResult testresult = Normalizer.YES;
\r
1647 char[] data= new char[20];
\r
1648 char[] norm= new char[100];
\r
1649 char[] nfd = new char[100];
\r
1650 int normStart = 0;
\r
1652 while (size != 19) {
\r
1653 data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX];
\r
// NOTE(review): data[size] is a char, so this appends the character
// itself after "0x" rather than its hexadecimal value.
1654 logln("0x"+data[size]);
\r
// Normalize just this character and append it after the previous output.
1655 normStart += Normalizer.normalize(data,size,size+1,
\r
1656 norm,normStart,100,
\r
1657 Normalizer.NFD,0);
\r
// Normalize the assembled string as a whole for comparison.
1662 nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0);
\r
1663 // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL,
\r
1664 // nfd, 100, &status);
\r
1665 if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) {
\r
1666 testresult = Normalizer.NO;
\r
1668 if (testresult == Normalizer.YES) {
\r
1669 logln("result Normalizer.YES\n");
\r
1672 logln("result Normalizer.NO\n");
\r
1675 if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) {
\r
1676 errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) );
\r
1682 // reference implementation of Normalizer::compare
\r
// Builds the comparison result from separate steps so it can serve as the
// expected value for Normalizer.compare().
// The normalization options are taken from the upper bits of `options`
// (options >> Normalizer.COMPARE_NORM_OPTIONS_SHIFT). When
// COMPARE_IGNORE_CASE is set, both strings are canonically decomposed and
// case-folded first. The intermediate strings are then canonically
// decomposed into t1/t2 and compared, using a UTF16.StringComparator when
// COMPARE_CODE_POINT_ORDER is set and String.compareTo() otherwise.
// NOTE(review): the branch taken when COMPARE_IGNORE_CASE is not set is on
// lines not visible in this excerpt.
1683 private int ref_norm_compare(String s1, String s2, int options) {
\r
1684 String t1, t2,r1,r2;
\r
1686 int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
\r
1688 if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) {
\r
1689 // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
\r
1690 r1 = Normalizer.decompose(s1,false,normOptions);
\r
1691 r2 = Normalizer.decompose(s2,false,normOptions);
\r
1692 r1 = UCharacter.foldCase(r1,options);
\r
1693 r2 = UCharacter.foldCase(r2,options);
\r
1699 t1 = Normalizer.decompose(r1, false, normOptions);
\r
1700 t2 = Normalizer.decompose(r2, false, normOptions);
\r
1702 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
\r
1703 UTF16.StringComparator comp
\r
1704 = new UTF16.StringComparator(true, false,
\r
1705 UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1706 return comp.compare(t1,t2);
\r
1708 return t1.compareTo(t2);
\r
1713 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
\r
// Calls Normalizer.compare(s1, s2, options), first adding
// Normalizer.INPUT_IS_FCD to the options when quickCheck() in FCD mode
// returns Normalizer.YES for both strings. The quick check uses the
// normalization options from the upper bits of `options`.
1714 private int norm_compare(String s1, String s2, int options) {
\r
1715 int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
\r
1717 if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) &&
\r
1718 Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) {
\r
1719 options|=Normalizer.INPUT_IS_FCD;
\r
1722 return Normalizer.compare(s1, s2, options);
\r
1725 // reference implementation of UnicodeString::caseCompare
\r
// Case-folds both strings with UCharacter.foldCase() and compares the
// folded text. The boolean passed to foldCase() is true when
// FOLD_CASE_EXCLUDE_SPECIAL_I is NOT set in `options`. The comparison uses
// a UTF16.StringComparator when COMPARE_CODE_POINT_ORDER is set and
// String.compareTo() otherwise.
// NOTE(review): the declaration and initial values of t1/t2 are on lines
// not visible in this excerpt.
1726 private int ref_case_compare(String s1, String s2, int options) {
\r
1732 t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
\r
1733 t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
\r
1735 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
\r
1736 UTF16.StringComparator comp
\r
1737 = new UTF16.StringComparator(true, false,
\r
1738 UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1739 return comp.compare(t1,t2);
\r
1741 return t1.compareTo(t2);
\r
1746 // reduce an integer to -1/0/1
\r
// Used by the compare tests so that only the sign of two comparison
// results is compared, not their magnitude.
1747 private static int sign(int value) {
\r
// (value>>31) is -1 for a negative int and 0 otherwise, so OR-ing with 1
// gives -1 for negative values and 1 for non-negative ones.
// NOTE(review): the zero case is presumably handled on the lines before
// this one, which are not visible in this excerpt.
1751 return (value>>31)|1;
\r
// Returns a String describing the sign of `value`, with separate branches
// for negative, zero and positive input; TestCompare() places it in its
// failure messages.
// NOTE(review): the returned literals are on lines not visible in this
// excerpt.
1754 private static String signString(int value) {
\r
1757 } else if(value==0) {
\r
1759 } else /* value>0 */ {
\r
1763 // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
\r
1764 // by comparing it with its semantic equivalent
\r
1765 // since we trust the pieces, this is sufficient
\r
1767 // test each string with itself and each other
\r
1768 // each time with all options
\r
// Input data for TestCompare() and TestCompareDebug(). Entries may contain
// \\U escapes; both tests expand every entry with Utility.unescape()
// before comparing.
1769 private String strings[]=new String[]{
\r
1770 // some cases from NormalizationTest.txt
\r
1772 "D\u031B\u0307\u0323",
\r
1773 "\u1E0C\u031B\u0307",
\r
1774 "D\u031B\u0323\u0307",
\r
1775 "d\u031B\u0323\u0307",
\r
1782 // Angstrom sign = A ring
\r
1790 "a\u059A\u0316\u302A\u032Fb",
\r
1791 "a\u302A\u0316\u032F\u059Ab",
\r
1792 "a\u302A\u0316\u032F\u059Ab",
\r
1793 "A\u059A\u0316\u302A\u032Fb",
\r
1795 // from ICU case folding tests
\r
1797 "A\u00df\u00b5\ufb03\\U0001040c\u0131",
\r
1798 "ass\u03bcffi\\U00010434i",
\r
1799 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff",
\r
1800 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff",
\r
1801 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff",
\r
1802 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd",
\r
1804 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold
\r
1805 // vs. U+10000 at bottom - code point order
\r
1807 "\ud800\ud800\udc01",
\r
1810 // other code point order tests from ustrtest.cpp
\r
1813 "\u20ac\ud800\udc00",
\r
1818 "\uff61\ud800\udc02",
\r
1822 // long strings, see cnormtst.c/TestNormCoverage()
\r
1823 // equivalent if case-insensitive
\r
1825 "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1826 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1827 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1828 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1829 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1830 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1831 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1832 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1833 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1834 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1835 "\uAD8B\uAD8B\uAD8B\uAD8B"+
\r
1836 "d\u031B\u0307\u0323",
\r
1838 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
\r
1839 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1840 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1841 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1842 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1843 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
\r
1844 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
\r
1845 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
\r
1846 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
\r
1847 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
\r
1848 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
\r
1849 "\u1E0C\u031B\u0307",
\r
1851 // some strings that may make a difference whether the compare function
\r
1852 // case-folds or decomposes first
\r
1854 "\u0360\u0345\u0334",
\r
1855 "\u0360\u03b9\u0334",
\r
1857 "\u0360\u1f80\u0334",
\r
1858 "\u0360\u03b1\u0313\u03b9\u0334",
\r
1860 "\u0360\u1ffc\u0334",
\r
1861 "\u0360\u03c9\u03b9\u0334",
\r
1863 "a\u0360\u0345\u0360\u0345b",
\r
1864 "a\u0345\u0360\u0345\u0360b",
\r
1866 // interesting cases for canonical caseless match with turkic i handling
\r
1871 // strings with post-Unicode 3.2 normalization or normalization corrections
\r
1873 "\u00e4\u193b\\U0002f868",
\r
1874 "\u0061\u193b\u0308\u36fc",
\r
1879 // all combinations of options
\r
1880 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
\r
// Pairs a set of compare option bits with a display name; the compare
// tests read the pair back as opt[k].options and opt[k].name.
1881 final class Temp {
\r
// opt: option bits; str: name shown in failure messages.
1884 public Temp(int opt,String str){
\r
1890 // set UNORM_UNICODE_3_2 in one additional combination
\r
// Option combinations that the compare tests run for every pair of
// strings. The last entry places Normalizer.UNICODE_3_2 in the
// normalization-option bits by shifting it with
// Normalizer.COMPARE_NORM_OPTIONS_SHIFT.
1892 private Temp[] opt = new Temp[]{
\r
1893 new Temp(0,"default"),
\r
1894 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ),
\r
1895 new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ),
\r
1896 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ),
\r
1897 new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"),
\r
1898 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"),
\r
1899 new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2")
\r
// Debugging variant of the compare test.
// Unescapes every entry of strings[] and, for the index combinations
// i, j, k, checks that norm_compare() and ref_norm_compare() return
// results with the same sign. When COMPARE_IGNORE_CASE is set it also
// checks a UTF16.StringComparator against ref_case_compare().
// It ends with two direct Normalizer.compare() calls on a pair of strings
// that differ only in the case of their first letter.
// NOTE(review): the loop headers over i, j and k are on lines not visible
// in this excerpt.
1903 public void TestCompareDebug(){
\r
1905 String[] s = new String[100]; // at least as many items as in strings[] !
\r
1908 int i, j, k, count=strings.length;
\r
1909 int result, refResult;
\r
1911 // create the UnicodeStrings
\r
1912 for(i=0; i<count; ++i) {
\r
1913 s[i]=Utility.unescape(strings[i]);
\r
1915 UTF16.StringComparator comp = new UTF16.StringComparator(true, false,
\r
1916 UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1917 // test them each with each other
\r
1922 // test Normalizer::compare
\r
1923 result=norm_compare(s[i], s[j], opt[k].options);
\r
1924 refResult=ref_norm_compare(s[i], s[j], opt[k].options);
\r
1925 if(sign(result)!=sign(refResult)) {
\r
1926 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
1929 // test UnicodeString::caseCompare - same internal implementation function
\r
1930 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
\r
1931 // result=s[i]. (s[j], opt[k].options);
\r
1932 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
\r
1934 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1937 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
// NOTE(review): `result` comes from comp.compare() here, but the failure
// message below is still labelled "Normalizer::compare".
1940 result=comp.compare(s[i],s[j]);
\r
1941 refResult=ref_case_compare(s[i], s[j], opt[k].options);
\r
1942 if(sign(result)!=sign(refResult)) {
\r
1943 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
// U+00DA and U+00FA differ only in case; the rest of the text is identical.
1946 String value1 = "\u00dater\u00fd";
\r
1947 String value2 = "\u00fater\u00fd";
\r
1948 if(Normalizer.compare(value1,value2,0)!=0){
\r
1949 if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){
\r
/**
 * Cross-checks norm_compare() and UTF16.StringComparator against the reference
 * helpers ref_norm_compare() and ref_case_compare().
 *
 * Part 1: every pair (i, j) with j >= i of the unescaped strings[] entries is
 * compared under every entry of opt[]; only the sign of each result is checked.
 * The StringComparator check runs only when COMPARE_IGNORE_CASE is set in the options.
 *
 * Part 2: every code point in the ranges of `set` (built from the canonical start
 * sets of I, i, U+0130 and U+0131) is compared with its decomposition under the
 * same options.
 */
1955 public void TestCompare() {
\r
1957 String[] s = new String[100]; // at least as many items as in strings[] !
\r
1959 int i, j, k, count=strings.length;
\r
1960 int result, refResult;
\r
1962 // create the UnicodeStrings
\r
1963 for(i=0; i<count; ++i) {
\r
1964 s[i]=Utility.unescape(strings[i]);
\r
1966 UTF16.StringComparator comp = new UTF16.StringComparator();
\r
1967 // test them each with each other
\r
1968 for(i=0; i<count; ++i) {
\r
1969 for(j=i; j<count; ++j) {
\r
1970 for(k=0; k<opt.length; ++k) {
\r
1971 // test Normalizer::compare
\r
1972 result=norm_compare(s[i], s[j], opt[k].options);
\r
1973 refResult=ref_norm_compare(s[i], s[j], opt[k].options);
\r
1974 if(sign(result)!=sign(refResult)) {
\r
1975 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
1978 // test UnicodeString::caseCompare - same internal implementation function
\r
1979 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
\r
1980 // result=s[i]. (s[j], opt[k].options);
\r
// pick the comparator's folding mode from the FOLD_CASE_EXCLUDE_SPECIAL_I option bit
1981 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
\r
1983 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
1986 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
1989 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
\r
1990 // result=comp.caseCompare(s[i],s[j], opt[k].options);
\r
1991 result=comp.compare(s[i],s[j]);
\r
1992 refResult=ref_case_compare(s[i], s[j], opt[k].options);
\r
1993 if(sign(result)!=sign(refResult)) {
\r
1994 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
\r
2001 // test cases with i and I to make sure Turkic works
\r
2002 char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
\r
2003 UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();
\r
2004 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;
\r
2005 nfcImpl.ensureCanonIterData();
\r
2010 // collect all sets into one for contiguous output
\r
2011 for(i=0; i<iI.length; ++i) {
\r
2012 if(nfcImpl.getCanonStartSet(iI[i], iSet)) {
\r
2017 // test all of these precomposed characters
\r
2018 UnicodeSetIterator it = new UnicodeSetIterator(set);
\r
2019 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
\r
2020 start=it.codepoint;
\r
2021 end=it.codepointEnd;
\r
2022 while(start<=end) {
\r
// Build the string holding the code point itself. Integer.toString(start) would
// yield its decimal digits, so the character would never be compared with its NFD.
2023 s1 = UTF16.valueOf(start);
\r
2024 s2 = Normalizer.decompose(s1, false, 0);
\r
2025 // if(U_FAILURE(errorCode)) {
\r
2026 // errln("Normalizer::decompose(U+%04x) failed: %s", start, u_errorName(errorCode));
\r
2029 for(k=0; k<opt.length; ++k) {
\r
2030 // test Normalizer::compare
\r
2032 result= norm_compare(s1, s2, opt[k].options);
\r
2033 refResult=ref_norm_compare(s1, s2, opt[k].options);
\r
2034 if(sign(result)!=sign(refResult)) {
\r
2035 errln("Normalizer.compare(U+"+hex(start)+" with its NFD, "+opt[k].name+")"
\r
2036 + signString(result)+" should be "+signString(refResult));
\r
2039 // test UnicodeString::caseCompare - same internal implementation function
\r
2040 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) {
\r
2041 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
\r
2043 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
\r
2046 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
2049 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
\r
2051 result=comp.compare(s1,s2);
\r
2052 refResult=ref_case_compare(s1, s2, opt[k].options);
\r
2053 if(sign(result)!=sign(refResult)) {
\r
2054 errln("UTF16.compare(U+"+hex(start)+" with its NFD, "
\r
2055 +opt[k].name+")"+signString(result) +" should be "+signString(refResult));
\r
2066 // verify that case-folding does not un-FCD strings
\r
/**
 * Walks the code points 0..0x10ffff (skipping unassigned ones and the Hangul/Han
 * ranges) and reports, via errln, each one whose case folding may turn an FCD
 * string into a non-FCD one, or whose folding is not NFD although the code point is.
 *
 * @param foldingOptions case-folding option bits; the caller passes 0 and
 *        Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I. The value is handed to
 *        UCharacter.foldCase() as the boolean (foldingOptions==0).
 * @return the number of offending code points (also written to the log)
 */
2067 int countFoldFCDExceptions(int foldingOptions) {
\r
2071 int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC;
\r
2072 Normalizer.QuickCheckResult qcResult;
\r
2077 logln("Test if case folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
\r
2080 for(c=0; c<=0x10ffff; ++c) {
\r
2081 category=UCharacter.getType(c);
\r
2082 if(category==UCharacterCategory.UNASSIGNED) {
\r
2083 continue; // skip unassigned code points
\r
2086 c=0xd7a3; // skip Hangul - no case folding there
\r
2089 // skip Han blocks - no case folding there either
\r
2103 s= UTF16.valueOf(c);
\r
2105 // get leading and trailing cc for c
\r
2106 d= Normalizer.decompose(s,false);
\r
2108 cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));
\r
2109 trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
\r
2111 // get leading and trailing cc for the case-folding of c
\r
// Strings are immutable: foldCase() returns the folded text, so it must be assigned
// back to s. Otherwise every check below would look at the unfolded character.
2112 s = UCharacter.foldCase(s,(foldingOptions==0));
\r
2113 d = Normalizer.decompose(s, false);
\r
2114 foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));
\r
2115 foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
\r
2117 qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);
\r
2121 // - character maps to empty string: adjacent characters may then need reordering
\r
2122 // - folding has different leading/trailing cc's, and they don't become just 0
\r
2123 // - folding itself is not FCD
\r
2124 if( qcResult!=Normalizer.YES ||
\r
2126 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)
\r
2129 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
\r
2130 //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult);
\r
2135 // if a code point is in NFD but its case folding is not, then
\r
2136 // unorm_compare will also fail
\r
2137 if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {
\r
2139 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
\r
2143 logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")" );
\r
/**
 * Sums the results of countFoldFCDExceptions() for the default folding options (0)
 * and for Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, and reports the total with errln.
 */
2147 public void TestFindFoldFCDExceptions() {
\r
// accumulate the exception counts of both folding modes
2150 count=countFoldFCDExceptions(0);
\r
2151 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
2154 //* If case-folding un-FCDs any strings, then unorm_compare() must be
\r
2155 //* re-implemented.
\r
2156 //* It currently assumes that one can check for FCD then case-fold
\r
2157 //* and then still have FCD strings for raw decomposition without reordering.
\r
// NOTE(review): the message names BasicNormalizerTest::FindFoldFCDExceptions(), which
// looks like the C++ counterpart of this method - confirm the reference is still useful.
2159 errln("error: There are "+count+" code points for which case-folding"+
\r
2160 " may un-FCD a string for all folding options.\n See comment"+
\r
2161 " in BasicNormalizerTest::FindFoldFCDExceptions()!");
\r
/**
 * Decomposes a run of five combining marks (U+0F71..U+0F75) with
 * Normalizer.decompose(src, false) and checks that the result equals the
 * expected, reordered sequence; a mismatch is reported with both strings in hex.
 */
2165 public void TestCombiningMarks(){
\r
2166 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";
\r
2167 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
\r
2168 String result = Normalizer.decompose(src,false);
\r
2169 if(!expected.equals(result)){
\r
2170 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result));
\r
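2175 * Re-enable this test when UTC fixes UAX 21.
* NOTE(review): before re-enabling, fix the disabled loop below - it reads cases[0] on
* every iteration instead of cases[i], and the first Normalizer.compare() call has a
* misplaced closing parenthesis (the "==0" ends up inside the argument list).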
\r
2176 public void TestUAX21Failure(){
\r
2177 final String[][] cases = new String[][]{
\r
2178 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"},
\r
2179 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"},
\r
2180 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
\r
2181 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
\r
2182 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"},
\r
2183 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"},
\r
2185 for(int i = 0; i< cases.length; i++){
\r
2186 String s1 =cases[0][0];
\r
2187 String s2 = cases[0][1];
\r
2188 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare
\r
2190 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){
\r
2191 errln("Normalizer.compare() failed for s1: "
\r
2192 + Utility.hex(s1) +" s2: " + Utility.hex(s2));
\r
/**
 * Table-driven check of Normalizer.getFC_NFKC_Closure(int): for each
 * (code point, expected string) pair the returned string must equal the expected
 * one, otherwise an error naming the code point is logged. Afterwards the
 * two-argument overload is called with a null second argument as an
 * error-handling check.
 */
2197 public void TestFCNFKCClosure() {
\r
// local holder for one test case: code point plus expected closure string
2198 final class TestStruct{
\r
2201 TestStruct(int cp, String src){
\r
// an empty expected string is listed for several code points (e.g. U+00C4, U+0061)
2207 TestStruct[] tests= new TestStruct[]{
\r
2208 new TestStruct( 0x00C4, "" ),
\r
2209 new TestStruct( 0x00E4, "" ),
\r
2210 new TestStruct( 0x037A, "\u0020\u03B9" ),
\r
2211 new TestStruct( 0x03D2, "\u03C5" ),
\r
2212 new TestStruct( 0x20A8, "\u0072\u0073" ) ,
\r
2213 new TestStruct( 0x210B, "\u0068" ),
\r
2214 new TestStruct( 0x210C, "\u0068" ),
\r
2215 new TestStruct( 0x2121, "\u0074\u0065\u006C" ),
\r
2216 new TestStruct( 0x2122, "\u0074\u006D" ),
\r
2217 new TestStruct( 0x2128, "\u007A" ),
\r
2218 new TestStruct( 0x1D5DB,"\u0068" ),
\r
2219 new TestStruct( 0x1D5ED,"\u007A" ),
\r
2220 new TestStruct( 0x0061, "" )
\r
2224 for(int i = 0; i < tests.length; ++ i) {
\r
2225 String result=Normalizer.getFC_NFKC_Closure(tests[i].c);
\r
2226 if(!result.equals(new String(tests[i].s))) {
\r
2227 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong");
\r
2231 /* error handling */
\r
2233 int length=Normalizer.getFC_NFKC_Closure(0x5c, null);
\r
2235 errln("getFC_NFKC_Closure did not perform error handling correctly");
\r
/**
 * Regression test: composes every two-character string made of a char in
 * U+3000..U+30FF followed by U+309A with Normalizer.compose(input, false) and
 * logs an error if the call throws IndexOutOfBoundsException. The composed
 * result itself is not inspected.
 */
2238 public void TestBugJ2324(){
\r
2239 /* String[] input = new String[]{
\r
2248 String troublesome = "\u309A";
\r
2249 for(int i=0x3000; i<0x3100;i++){
\r
2250 String input = ((char)i)+troublesome;
\r
2252 /* String result =*/ Normalizer.compose(input,false);
\r
2253 }catch(IndexOutOfBoundsException e){
\r
2254 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString());
\r
// Index constants. D, C, KD and KC are used as indices into the skipSets array
// in initSkippables(); the uses of FCD and NONE are elsewhere in the file.
2260 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5;
\r
2261 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets){
\r
2262 if( skipSets.length < 4 ){
\r
2265 skipSets[D].applyPattern(
\r
2266 "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
\r
2267 + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD"
\r
2268 + "\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137"
\r
2269 + "\\u0139-\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165"
\r
2270 + "\\u0168-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC"
\r
2271 + "\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B"
\r
2272 + "\\u021E\\u021F\\u0226-\\u0233\\u0300-\\u034E\\u0350-\\u036F"
\r
2273 + "\\u0374\\u037E\\u0385-\\u038A\\u038C\\u038E-\\u0390\\u03AA-"
\r
2274 + "\\u03B0\\u03CA-\\u03CE\\u03D3\\u03D4\\u0400\\u0401\\u0403\\u0407"
\r
2275 + "\\u040C-\\u040E\\u0419\\u0439\\u0450\\u0451\\u0453\\u0457\\u045C"
\r
2276 + "-\\u045E\\u0476\\u0477\\u0483-\\u0487\\u04C1\\u04C2\\u04D0-"
\r
2277 + "\\u04D3\\u04D6\\u04D7\\u04DA-\\u04DF\\u04E2-\\u04E7\\u04EA-"
\r
2278 + "\\u04F5\\u04F8\\u04F9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4"
\r
2279 + "\\u05C5\\u05C7\\u0610-\\u061A\\u0622-\\u0626\\u064B-\\u065E"
\r
2280 + "\\u0670\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4"
\r
2281 + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
\r
2282 + "\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"
\r
2283 + "\\u082D\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958"
\r
2284 + "-\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33"
\r
2285 + "\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C"
\r
2286 + "\\u0B48\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD"
\r
2287 + "\\u0C48\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA"
\r
2288 + "\\u0CCB\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE"
\r
2289 + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
\r
2290 + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
\r
2291 + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
\r
2292 + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
\r
2293 + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u135F\\u1714\\u1734"
\r
2294 + "\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75"
\r
2295 + "-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34"
\r
2296 + "\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA"
\r
2297 + "\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8\\u1CED"
\r
2298 + "\\u1DC0-\\u1DE6\\u1DFD-\\u1E99\\u1E9B\\u1EA0-\\u1EF9\\u1F00-"
\r
2299 + "\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-"
\r
2300 + "\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4"
\r
2301 + "\\u1FB6-\\u1FBC\\u1FBE\\u1FC1-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-"
\r
2302 + "\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFD\\u2000"
\r
2303 + "\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"
\r
2304 + "\\u212B\\u219A\\u219B\\u21AE\\u21CD-\\u21CF\\u2204\\u2209\\u220C"
\r
2305 + "\\u2224\\u2226\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"
\r
2306 + "\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"
\r
2307 + "\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"
\r
2308 + "\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-\\u2DFF\\u302A-"
\r
2309 + "\\u302F\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"
\r
2310 + "\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"
\r
2311 + "\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"
\r
2312 + "\\u3099\\u309A\\u309E\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4\\u30B6"
\r
2313 + "\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7\\u30C9"
\r
2314 + "\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA\\u30DC"
\r
2315 + "\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\uA66F\\uA67C\\uA67D\\uA6F0"
\r
2316 + "\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D\\uA953"
\r
2317 + "\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF"
\r
2318 + "\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12"
\r
2319 + "\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D"
\r
2320 + "\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36"
\r
2321 + "\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-"
\r
2322 + "\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"
\r
2323 + "38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"
\r
2324 + "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
\r
2325 + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
\r
2326 + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"
\r
2327 + "F800-\\U0002FA1D]", false);
\r
2329 skipSets[C].applyPattern(
\r
2330 "[^<->A-PR-Za-pr-z\\u00A8\\u00C0-\\u00CF\\u00D1-\\u00D6\\u00D8-"
\r
2331 + "\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD\\u00FF-"
\r
2332 + "\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121\\u0124"
\r
2333 + "\\u0125\\u0128-\\u012D\\u0130\\u0139\\u013A\\u013D\\u013E\\u0143"
\r
2334 + "\\u0144\\u0147\\u0148\\u014C-\\u0151\\u0154\\u0155\\u0158-"
\r
2335 + "\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168-\\u0171\\u0174-"
\r
2336 + "\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7\\u01CD-\\u01DC\\u01DE"
\r
2337 + "-\\u01E1\\u01E6-\\u01EB\\u01F4\\u01F5\\u01F8-\\u01FB\\u0200-"
\r
2338 + "\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0292\\u0300-\\u034E"
\r
2339 + "\\u0350-\\u036F\\u0374\\u037E\\u0387\\u0391\\u0395\\u0397\\u0399"
\r
2340 + "\\u039F\\u03A1\\u03A5\\u03A9\\u03AC\\u03AE\\u03B1\\u03B5\\u03B7"
\r
2341 + "\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-\\u03CB\\u03CE\\u03D2\\u0406"
\r
2342 + "\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423\\u0427\\u042B"
\r
2343 + "\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E\\u0443\\u0447"
\r
2344 + "\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487\\u04D8\\u04D9"
\r
2345 + "\\u04E8\\u04E9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5"
\r
2346 + "\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627\\u0648\\u064A-"
\r
2347 + "\\u065E\\u0670\\u06C1\\u06D2\\u06D5-\\u06DC\\u06DF-\\u06E4"
\r
2348 + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
\r
2349 + "\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"
\r
2350 + "\\u082D\\u0928\\u0930\\u0933\\u093C\\u094D\\u0951-\\u0954\\u0958"
\r
2351 + "-\\u095F\\u09BC\\u09BE\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF"
\r
2352 + "\\u0A33\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD"
\r
2353 + "\\u0B3C\\u0B3E\\u0B47\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92"
\r
2354 + "\\u0BBE\\u0BC6\\u0BC7\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56"
\r
2355 + "\\u0CBC\\u0CBF\\u0CC2\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E"
\r
2356 + "\\u0D46\\u0D47\\u0D4D\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF"
\r
2357 + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
\r
2358 + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
\r
2359 + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
\r
2360 + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
\r
2361 + "\\u0FC6\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u1100-\\u1112"
\r
2362 + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
\r
2363 + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"
\r
2364 + "\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"
\r
2365 + "\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"
\r
2366 + "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
\r
2367 + "\\u1CED\\u1DC0-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F\\u1E12-"
\r
2368 + "\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-"
\r
2369 + "\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E99\\u1EA0-"
\r
2370 + "\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-"
\r
2371 + "\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51"
\r
2372 + "\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-"
\r
2373 + "\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99"
\r
2374 + "\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB\\u1FBC\\u1FBE"
\r
2375 + "\\u1FBF\\u1FC3\\u1FC6\\u1FC9\\u1FCB\\u1FCC\\u1FD3\\u1FDB\\u1FE3"
\r
2376 + "\\u1FEB\\u1FEE\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000"
\r
2377 + "\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"
\r
2378 + "\\u212B\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"
\r
2379 + "\\u220B\\u2223\\u2225\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261"
\r
2380 + "\\u2264\\u2265\\u2272\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282"
\r
2381 + "\\u2283\\u2286\\u2287\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB"
\r
2382 + "\\u22B2-\\u22B5\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-"
\r
2383 + "\\u2DFF\\u302A-\\u302F\\u3046\\u304B\\u304D\\u304F\\u3051\\u3053"
\r
2384 + "\\u3055\\u3057\\u3059\\u305B\\u305D\\u305F\\u3061\\u3064\\u3066"
\r
2385 + "\\u3068\\u306F\\u3072\\u3075\\u3078\\u307B\\u3099\\u309A\\u309D"
\r
2386 + "\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1\\u30B3\\u30B5\\u30B7\\u30B9"
\r
2387 + "\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4\\u30C6\\u30C8\\u30CF\\u30D2"
\r
2388 + "\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2\\u30FD\\uA66F\\uA67C\\uA67D"
\r
2389 + "\\uA6F0\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D"
\r
2390 + "\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE"
\r
2391 + "\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C"
\r
2392 + "\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88"
\r
2393 + "\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84"
\r
2394 + "\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80"
\r
2395 + "\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C"
\r
2396 + "\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178"
\r
2397 + "\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274"
\r
2398 + "\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370"
\r
2399 + "\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C"
\r
2400 + "\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568"
\r
2401 + "\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664"
\r
2402 + "\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760"
\r
2403 + "\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C"
\r
2404 + "\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958"
\r
2405 + "\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54"
\r
2406 + "\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50"
\r
2407 + "\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C"
\r
2408 + "\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48"
\r
2409 + "\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44"
\r
2410 + "\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40"
\r
2411 + "\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C"
\r
2412 + "\\uC058\\uC074\\uC090\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138"
\r
2413 + "\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234"
\r
2414 + "\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330"
\r
2415 + "\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C"
\r
2416 + "\\uC448\\uC464\\uC480\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528"
\r
2417 + "\\uC544\\uC560\\uC57C\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624"
\r
2418 + "\\uC640\\uC65C\\uC678\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720"
\r
2419 + "\\uC73C\\uC758\\uC774\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C"
\r
2420 + "\\uC838\\uC854\\uC870\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918"
\r
2421 + "\\uC934\\uC950\\uC96C\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14"
\r
2422 + "\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10"
\r
2423 + "\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C"
\r
2424 + "\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08"
\r
2425 + "\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04"
\r
2426 + "\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00"
\r
2427 + "\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC"
\r
2428 + "\\uD018\\uD034\\uD050\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8"
\r
2429 + "\\uD114\\uD130\\uD14C\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4"
\r
2430 + "\\uD210\\uD22C\\uD248\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0"
\r
2431 + "\\uD30C\\uD328\\uD344\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC"
\r
2432 + "\\uD408\\uD424\\uD440\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8"
\r
2433 + "\\uD504\\uD520\\uD53C\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4"
\r
2434 + "\\uD600\\uD61C\\uD638\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0"
\r
2435 + "\\uD6FC\\uD718\\uD734\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10"
\r
2436 + "\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-"
\r
2437 + "\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-"
\r
2438 + "\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
\r
2439 + "-\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010"
\r
2440 + "A38-\\U00010A3A\\U00010A3F\\U00011099\\U0001109B\\U000110A5"
\r
2441 + "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
\r
2442 + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
\r
2443 + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"
\r
2444 + "F800-\\U0002FA1D]", false);
\r
2446 skipSets[KD].applyPattern(
\r
2447 "[^\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5\\u00B8-\\u00BA"
\r
2448 + "\\u00BC-\\u00BE\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6"
\r
2449 + "\\u00D9-\\u00DD\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6"
\r
2450 + "\\u00F9-\\u00FD\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130"
\r
2451 + "\\u0132-\\u0137\\u0139-\\u0140\\u0143-\\u0149\\u014C-\\u0151"
\r
2452 + "\\u0154-\\u0165\\u0168-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0"
\r
2453 + "\\u01C4-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B"
\r
2454 + "\\u021E\\u021F\\u0226-\\u0233\\u02B0-\\u02B8\\u02D8-\\u02DD"
\r
2455 + "\\u02E0-\\u02E4\\u0300-\\u034E\\u0350-\\u036F\\u0374\\u037A"
\r
2456 + "\\u037E\\u0384-\\u038A\\u038C\\u038E-\\u0390\\u03AA-\\u03B0"
\r
2457 + "\\u03CA-\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
\r
2458 + "\\u03F9\\u0400\\u0401\\u0403\\u0407\\u040C-\\u040E\\u0419\\u0439"
\r
2459 + "\\u0450\\u0451\\u0453\\u0457\\u045C-\\u045E\\u0476\\u0477\\u0483"
\r
2460 + "-\\u0487\\u04C1\\u04C2\\u04D0-\\u04D3\\u04D6\\u04D7\\u04DA-"
\r
2461 + "\\u04DF\\u04E2-\\u04E7\\u04EA-\\u04F5\\u04F8\\u04F9\\u0587"
\r
2462 + "\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610"
\r
2463 + "-\\u061A\\u0622-\\u0626\\u064B-\\u065E\\u0670\\u0675-\\u0678"
\r
2464 + "\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7"
\r
2465 + "\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-\\u07F3"
\r
2466 + "\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D"
\r
2467 + "\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958-"
\r
2468 + "\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36"
\r
2469 + "\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B48"
\r
2470 + "\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD\\u0C48"
\r
2471 + "\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA\\u0CCB"
\r
2472 + "\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE\\u0E33"
\r
2473 + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-"
\r
2474 + "\\u0ECB\\u0EDC\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39"
\r
2475 + "\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80"
\r
2476 + "-\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
\r
2477 + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u10FC\\u135F\\u1714"
\r
2478 + "\\u1734\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60"
\r
2479 + "\\u1A75-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12"
\r
2480 + "\\u1B34\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73"
\r
2481 + "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
\r
2482 + "\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"
\r
2483 + "\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E9B\\u1EA0-\\u1EF9"
\r
2484 + "\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
\r
2485 + "\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-"
\r
2486 + "\\u1FB4\\u1FB6-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-"
\r
2487 + "\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFE\\u2000-\\u200A\\u2011"
\r
2488 + "\\u2017\\u2024-\\u2026\\u202F\\u2033\\u2034\\u2036\\u2037\\u203C"
\r
2489 + "\\u203E\\u2047-\\u2049\\u2057\\u205F\\u2070\\u2071\\u2074-"
\r
2490 + "\\u208E\\u2090-\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1\\u20E5-"
\r
2491 + "\\u20F0\\u2100-\\u2103\\u2105-\\u2107\\u2109-\\u2113\\u2115"
\r
2492 + "\\u2116\\u2119-\\u211D\\u2120-\\u2122\\u2124\\u2126\\u2128"
\r
2493 + "\\u212A-\\u212D\\u212F-\\u2131\\u2133-\\u2139\\u213B-\\u2140"
\r
2494 + "\\u2145-\\u2149\\u2150-\\u217F\\u2189\\u219A\\u219B\\u21AE"
\r
2495 + "\\u21CD-\\u21CF\\u2204\\u2209\\u220C\\u2224\\u2226\\u222C\\u222D"
\r
2496 + "\\u222F\\u2230\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"
\r
2497 + "\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"
\r
2498 + "\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"
\r
2499 + "\\u2329\\u232A\\u2460-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC"
\r
2500 + "\\u2C7C\\u2C7D\\u2CEF-\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F"
\r
2501 + "\\u2EF3\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F\\u3036\\u3038-"
\r
2502 + "\\u303A\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"
\r
2503 + "\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"
\r
2504 + "\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"
\r
2505 + "\\u3099-\\u309C\\u309E\\u309F\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4"
\r
2506 + "\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7"
\r
2507 + "\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA"
\r
2508 + "\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\u30FF\\u3131-"
\r
2509 + "\\u318E\\u3192-\\u319F\\u3200-\\u321E\\u3220-\\u3247\\u3250-"
\r
2510 + "\\u327E\\u3280-\\u32FE\\u3300-\\u33FF\\uA66F\\uA67C\\uA67D"
\r
2511 + "\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-"
\r
2512 + "\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8"
\r
2513 + "\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D"
\r
2514 + "\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A"
\r
2515 + "-\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-"
\r
2516 + "\\uFB17\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41"
\r
2517 + "\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F"
\r
2518 + "\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-\\uFE19\\uFE20-\\uFE26"
\r
2519 + "\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B"
\r
2520 + "\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC\\uFF01-\\uFFBE\\uFFC2-"
\r
2521 + "\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC\\uFFE0-"
\r
2522 + "\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"
\r
2523 + "38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"
\r
2524 + "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
\r
2525 + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
\r
2526 + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0001"
\r
2527 + "D400-\\U0001D454\\U0001D456-\\U0001D49C\\U0001D49E\\U0001D49F"
\r
2528 + "\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4A9-\\U0001D4AC\\U0001D"
\r
2529 + "4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-\\U0001D4C3\\U0001D4C5-"
\r
2530 + "\\U0001D505\\U0001D507-\\U0001D50A\\U0001D50D-\\U0001D514\\U0001"
\r
2531 + "D516-\\U0001D51C\\U0001D51E-\\U0001D539\\U0001D53B-\\U0001D53E"
\r
2532 + "\\U0001D540-\\U0001D544\\U0001D546\\U0001D54A-\\U0001D550\\U0001"
\r
2533 + "D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF"
\r
2534 + "\\U0001F100-\\U0001F10A\\U0001F110-\\U0001F12E\\U0001F131\\U0001"
\r
2535 + "F13D\\U0001F13F\\U0001F142\\U0001F146\\U0001F14A-\\U0001F14E"
\r
2536 + "\\U0001F190\\U0001F200\\U0001F210-\\U0001F231\\U0001F240-\\U0001"
\r
2537 + "F248\\U0002F800-\\U0002FA1D]", false);
\r
2539 skipSets[KC].applyPattern(
\r
2540 "[^<->A-PR-Za-pr-z\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5"
\r
2541 + "\\u00B8-\\u00BA\\u00BC-\\u00BE\\u00C0-\\u00CF\\u00D1-\\u00D6"
\r
2542 + "\\u00D8-\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD"
\r
2543 + "\\u00FF-\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121"
\r
2544 + "\\u0124\\u0125\\u0128-\\u012D\\u0130\\u0132\\u0133\\u0139\\u013A"
\r
2545 + "\\u013D-\\u0140\\u0143\\u0144\\u0147-\\u0149\\u014C-\\u0151"
\r
2546 + "\\u0154\\u0155\\u0158-\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168"
\r
2547 + "-\\u0171\\u0174-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7"
\r
2548 + "\\u01C4-\\u01DC\\u01DE-\\u01E1\\u01E6-\\u01EB\\u01F1-\\u01F5"
\r
2549 + "\\u01F8-\\u01FB\\u0200-\\u021B\\u021E\\u021F\\u0226-\\u0233"
\r
2550 + "\\u0292\\u02B0-\\u02B8\\u02D8-\\u02DD\\u02E0-\\u02E4\\u0300-"
\r
2551 + "\\u034E\\u0350-\\u036F\\u0374\\u037A\\u037E\\u0384\\u0385\\u0387"
\r
2552 + "\\u0391\\u0395\\u0397\\u0399\\u039F\\u03A1\\u03A5\\u03A9\\u03AC"
\r
2553 + "\\u03AE\\u03B1\\u03B5\\u03B7\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-"
\r
2554 + "\\u03CB\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
\r
2555 + "\\u03F9\\u0406\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423"
\r
2556 + "\\u0427\\u042B\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E"
\r
2557 + "\\u0443\\u0447\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487"
\r
2558 + "\\u04D8\\u04D9\\u04E8\\u04E9\\u0587\\u0591-\\u05BD\\u05BF\\u05C1"
\r
2559 + "\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627"
\r
2560 + "\\u0648\\u064A-\\u065E\\u0670\\u0675-\\u0678\\u06C1\\u06D2"
\r
2561 + "\\u06D5-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED"
\r
2562 + "\\u0711\\u0730-\\u074A\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-"
\r
2563 + "\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0928\\u0930\\u0933"
\r
2564 + "\\u093C\\u094D\\u0951-\\u0954\\u0958-\\u095F\\u09BC\\u09BE"
\r
2565 + "\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36\\u0A3C"
\r
2566 + "\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B3E\\u0B47"
\r
2567 + "\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92\\u0BBE\\u0BC6\\u0BC7"
\r
2568 + "\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CBF\\u0CC2"
\r
2569 + "\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E\\u0D46\\u0D47\\u0D4D"
\r
2570 + "\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF\\u0E33\\u0E38-\\u0E3A"
\r
2571 + "\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-\\u0ECB\\u0EDC"
\r
2572 + "\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D"
\r
2573 + "\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80-\\u0F84"
\r
2574 + "\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9\\u0FC6"
\r
2575 + "\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u10FC\\u1100-\\u1112"
\r
2576 + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
\r
2577 + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"
\r
2578 + "\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"
\r
2579 + "\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"
\r
2580 + "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
\r
2581 + "\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"
\r
2582 + "\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F"
\r
2583 + "\\u1E12-\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53"
\r
2584 + "\\u1E58-\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E9B"
\r
2585 + "\\u1EA0-\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19"
\r
2586 + "\\u1F20-\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50"
\r
2587 + "\\u1F51\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79"
\r
2588 + "\\u1F7B-\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98"
\r
2589 + "\\u1F99\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB-\\u1FC1"
\r
2590 + "\\u1FC3\\u1FC6\\u1FC9\\u1FCB-\\u1FCF\\u1FD3\\u1FDB\\u1FDD-"
\r
2591 + "\\u1FDF\\u1FE3\\u1FEB\\u1FED-\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB"
\r
2592 + "-\\u1FFE\\u2000-\\u200A\\u2011\\u2017\\u2024-\\u2026\\u202F"
\r
2593 + "\\u2033\\u2034\\u2036\\u2037\\u203C\\u203E\\u2047-\\u2049\\u2057"
\r
2594 + "\\u205F\\u2070\\u2071\\u2074-\\u208E\\u2090-\\u2094\\u20A8"
\r
2595 + "\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2100-\\u2103\\u2105-"
\r
2596 + "\\u2107\\u2109-\\u2113\\u2115\\u2116\\u2119-\\u211D\\u2120-"
\r
2597 + "\\u2122\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2131"
\r
2598 + "\\u2133-\\u2139\\u213B-\\u2140\\u2145-\\u2149\\u2150-\\u217F"
\r
2599 + "\\u2189\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"
\r
2600 + "\\u220B\\u2223\\u2225\\u222C\\u222D\\u222F\\u2230\\u223C\\u2243"
\r
2601 + "\\u2245\\u2248\\u224D\\u2261\\u2264\\u2265\\u2272\\u2273\\u2276"
\r
2602 + "\\u2277\\u227A-\\u227D\\u2282\\u2283\\u2286\\u2287\\u2291\\u2292"
\r
2603 + "\\u22A2\\u22A8\\u22A9\\u22AB\\u22B2-\\u22B5\\u2329\\u232A\\u2460"
\r
2604 + "-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2CEF-"
\r
2605 + "\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F\\u2EF3\\u2F00-\\u2FD5"
\r
2606 + "\\u3000\\u302A-\\u302F\\u3036\\u3038-\\u303A\\u3046\\u304B"
\r
2607 + "\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059\\u305B\\u305D"
\r
2608 + "\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072\\u3075\\u3078"
\r
2609 + "\\u307B\\u3099-\\u309D\\u309F\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1"
\r
2610 + "\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4"
\r
2611 + "\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2"
\r
2612 + "\\u30FD\\u30FF\\u3131-\\u318E\\u3192-\\u319F\\u3200-\\u321E"
\r
2613 + "\\u3220-\\u3247\\u3250-\\u327E\\u3280-\\u32FE\\u3300-\\u33FF"
\r
2614 + "\\uA66F\\uA67C\\uA67D\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-"
\r
2615 + "\\uA8F1\\uA92B-\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-"
\r
2616 + "\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C"
\r
2617 + "\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18"
\r
2618 + "\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14"
\r
2619 + "\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10"
\r
2620 + "\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C"
\r
2621 + "\\uB028\\uB044\\uB060\\uB07C\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108"
\r
2622 + "\\uB124\\uB140\\uB15C\\uB178\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204"
\r
2623 + "\\uB220\\uB23C\\uB258\\uB274\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300"
\r
2624 + "\\uB31C\\uB338\\uB354\\uB370\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC"
\r
2625 + "\\uB418\\uB434\\uB450\\uB46C\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8"
\r
2626 + "\\uB514\\uB530\\uB54C\\uB568\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4"
\r
2627 + "\\uB610\\uB62C\\uB648\\uB664\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0"
\r
2628 + "\\uB70C\\uB728\\uB744\\uB760\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC"
\r
2629 + "\\uB808\\uB824\\uB840\\uB85C\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8"
\r
2630 + "\\uB904\\uB920\\uB93C\\uB958\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4"
\r
2631 + "\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0"
\r
2632 + "\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC"
\r
2633 + "\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8"
\r
2634 + "\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4"
\r
2635 + "\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0"
\r
2636 + "\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC"
\r
2637 + "\\uBFE8\\uC004\\uC020\\uC03C\\uC058\\uC074\\uC090\\uC0AC\\uC0C8"
\r
2638 + "\\uC0E4\\uC100\\uC11C\\uC138\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4"
\r
2639 + "\\uC1E0\\uC1FC\\uC218\\uC234\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0"
\r
2640 + "\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC"
\r
2641 + "\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448\\uC464\\uC480\\uC49C\\uC4B8"
\r
2642 + "\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544\\uC560\\uC57C\\uC598\\uC5B4"
\r
2643 + "\\uC5D0\\uC5EC\\uC608\\uC624\\uC640\\uC65C\\uC678\\uC694\\uC6B0"
\r
2644 + "\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C\\uC758\\uC774\\uC790\\uC7AC"
\r
2645 + "\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838\\uC854\\uC870\\uC88C\\uC8A8"
\r
2646 + "\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934\\uC950\\uC96C\\uC988\\uC9A4"
\r
2647 + "\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0"
\r
2648 + "\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C"
\r
2649 + "\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98"
\r
2650 + "\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94"
\r
2651 + "\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90"
\r
2652 + "\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C"
\r
2653 + "\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018\\uD034\\uD050\\uD06C\\uD088"
\r
2654 + "\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114\\uD130\\uD14C\\uD168\\uD184"
\r
2655 + "\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210\\uD22C\\uD248\\uD264\\uD280"
\r
2656 + "\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C\\uD328\\uD344\\uD360\\uD37C"
\r
2657 + "\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408\\uD424\\uD440\\uD45C\\uD478"
\r
2658 + "\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504\\uD520\\uD53C\\uD558\\uD574"
\r
2659 + "\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600\\uD61C\\uD638\\uD654\\uD670"
\r
2660 + "\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC\\uD718\\uD734\\uD750\\uD76C"
\r
2661 + "\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"
\r
2662 + "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6D\\uFA70-"
\r
2663 + "\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB36\\uFB38-"
\r
2664 + "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3"
\r
2665 + "-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-"
\r
2666 + "\\uFE19\\uFE20-\\uFE26\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-"
\r
2667 + "\\uFE66\\uFE68-\\uFE6B\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC"
\r
2668 + "\\uFF01-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7"
\r
2669 + "\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010"
\r
2670 + "A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010A3F\\U00011099"
\r
2671 + "\\U0001109B\\U000110A5\\U000110B9\\U000110BA\\U0001D15E-\\U0001D"
\r
2672 + "169\\U0001D16D-\\U0001D172\\U0001D17B-\\U0001D182\\U0001D185-"
\r
2673 + "\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001"
\r
2674 + "D242-\\U0001D244\\U0001D400-\\U0001D454\\U0001D456-\\U0001D49C"
\r
2675 + "\\U0001D49E\\U0001D49F\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4"
\r
2676 + "A9-\\U0001D4AC\\U0001D4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-"
\r
2677 + "\\U0001D4C3\\U0001D4C5-\\U0001D505\\U0001D507-\\U0001D50A\\U0001"
\r
2678 + "D50D-\\U0001D514\\U0001D516-\\U0001D51C\\U0001D51E-\\U0001D539"
\r
2679 + "\\U0001D53B-\\U0001D53E\\U0001D540-\\U0001D544\\U0001D546\\U0001"
\r
2680 + "D54A-\\U0001D550\\U0001D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB"
\r
2681 + "\\U0001D7CE-\\U0001D7FF\\U0001F100-\\U0001F10A\\U0001F110-"
\r
2682 + "\\U0001F12E\\U0001F131\\U0001F13D\\U0001F13F\\U0001F142\\U0001F1"
\r
2683 + "46\\U0001F14A-\\U0001F14E\\U0001F190\\U0001F200\\U0001F210-"
\r
2684 + "\\U0001F231\\U0001F240-\\U0001F248\\U0002F800-\\U0002FA1D]", false);
\r
/**
 * Checks the normalization-inert ("skippable") sets for NFD, NFC, NFKD and NFKC.
 * The sets built from the [:NF*_Inert:] properties are compared with the sets
 * returned by initSkippables(); every mismatch is reported together with the
 * actual set, both set differences and the intersection.
 */
2689 public void TestSkippable() {
\r
// One set per normalization form, filled from the property patterns below.
2690 UnicodeSet[] skipSets = new UnicodeSet[] {
\r
2691 new UnicodeSet(), //NFD
\r
2692 new UnicodeSet(), //NFC
\r
2693 new UnicodeSet(), //NFKD
\r
2694 new UnicodeSet() //NFKC
\r
// NOTE(review): the initializer of expectSets is not visible in this block.
2696 UnicodeSet[] expectSets = new UnicodeSet[] {
\r
2702 StringBuilder s, pattern;
\r
// D, C, KD and KC are not defined in this block; presumably index constants
// matching the NFD/NFC/NFKD/NFKC order used above - confirm.
2704 // build NF*Skippable sets from runtime data
\r
2705 skipSets[D].applyPattern("[:NFD_Inert:]");
\r
2706 skipSets[C].applyPattern("[:NFC_Inert:]");
\r
2707 skipSets[KD].applyPattern("[:NFKD_Inert:]");
\r
2708 skipSets[KC].applyPattern("[:NFKC_Inert:]");
\r
// initSkippables is defined outside this block; presumably it fills in the
// hard-coded expected patterns - confirm there.
2710 expectSets = initSkippables(expectSets);
\r
// U+0350 must not be part of the expected NFD set.
2711 if(expectSets[D].contains(0x0350)){
\r
2712 errln("expectSets[D] contains 0x0350");
\r
// Compare actual and expected set for each normalization form.
2714 for(int i=0; i<expectSets.length; ++i) {
\r
2715 if(!skipSets[i].equals(expectSets[i])) {
\r
2716 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"+
\r
2717 "May need to update hardcoded UnicodeSet patterns in com.ibm.icu.dev.test.normalizer.BasicTest.java\n"+
\r
2718 "See ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n" +
\r
2719 "Run com.ibm.text.UCD.Main with the option NFSkippable.");
\r
// Build a second, detailed message: the actual set first ...
2721 s=new StringBuilder();
\r
2723 s.append("\n\nskip= ");
\r
2724 s.append(skipSets[i].toPattern(true));
\r
// ... then the code points that are in the actual set but not expected
// (computed on a clone so skipSets[i] itself is not modified) ...
2727 s.append("skip-expect=");
\r
2728 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));
\r
2729 s.append(pattern);
\r
// Empties the builder; pattern is reassigned to a fresh StringBuilder below.
2731 pattern.delete(0,pattern.length());
\r
// ... then the code points that are expected but missing from the actual set ...
2732 s.append("\n\nexpect-skip=");
\r
2733 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));
\r
2734 s.append(pattern);
\r
2737 pattern.delete(0,pattern.length());
\r
// ... and finally the code points both sets agree on.
2738 s.append("\n\nintersection(expect,skip)=");
\r
2739 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);
\r
2740 pattern = new StringBuilder(intersection.toPattern(true));
\r
2741 s.append(pattern);
\r
2744 errln(s.toString());
\r
/**
 * Regression test named after bug J2068: constructs a Normalizer (NFC, options 0)
 * over a UCharacterIterator and reports an error when the iterator and the
 * Normalizer return the same current character.
 */
2749 public void TestBugJ2068(){
\r
2750 String sample = "The quick brown fox jumped over the lazy dog";
\r
2751 UCharacterIterator text = UCharacterIterator.getInstance(sample);
\r
2752 Normalizer norm = new Normalizer(text,Normalizer.NFC,0);
\r
// NOTE(review): this comparison can only detect a shared iterator if text is
// repositioned after the Normalizer is constructed; no such statement is
// visible in this block - confirm against the full file.
2754 if(text.current() == norm.current()){
\r
2755 errln("Normalizer is not cloning the UCharacterIterator");
\r
/**
 * Walks the code point range and reports an error for every surrogate code
 * point (U+D800..U+DFFF) whose combining class is greater than zero.
 */
2758 public void TestGetCombiningClass(){
\r
// The bound is exclusive, so U+10FFFF itself is not visited.
2759 for(int i=0;i<0x10FFFF;i++){
\r
2760 int cc = UCharacter.getCombiningClass(i);
\r
// Only surrogate code points are checked; all other values of i are ignored.
2761 if(0xD800<= i && i<=0xDFFF && cc >0 ){
\r
// Same call as above, repeated before the failure is reported.
2762 cc = UCharacter.getCombiningClass(i);
\r
2763 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8));
\r
/**
 * Loads a hand-written serialized set into a USerializedSet, copies its ranges
 * into a UnicodeSet, and then checks that USerializedSet.contains() accepts the
 * code points of those ranges.
 */
2768 public void TestSerializedSet(){
\r
2769 USerializedSet sset=new USerializedSet();
\r
2770 UnicodeSet set = new UnicodeSet();
\r
// NOTE(review): start and end are used below but their declaration is not
// visible in this block; the first entries of the array literal are not
// visible either.
2773 char[] serialized = {
\r
2776 0xc0, 0xfe, 0xfffc,
\r
2777 1, 9, 0x10, 0xfffc
\r
// Parse the serialized form starting at index 0 of the array.
2779 sset.getSet(serialized, 0);
\r
2781 // collect all sets into one for contiguous output
\r
// startEnd receives the two bounds of each range from getRange().
2782 int[] startEnd = new int[2];
\r
2783 int count=sset.countRanges();
\r
2784 for(int j=0; j<count; ++j) {
\r
2785 sset.getRange(j, startEnd);
\r
2786 set.add(startEnd[0], startEnd[1]);
\r
2789 // test all of these characters
\r
2790 UnicodeSetIterator it = new UnicodeSetIterator(set);
\r
// Iterate range by range; stop as soon as the iterator reports a string entry.
2791 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
\r
2792 start=it.codepoint;
\r
2793 end=it.codepointEnd;
\r
// NOTE(review): the statement that advances start is not visible in this block.
2794 while(start<=end) {
\r
2795 if(!sset.contains(start)){
\r
2796 errln("USerializedSet.contains failed for "+Utility.hex(start,8));
\r
/**
 * Decomposes the same six-character input twice into one destination buffer,
 * first at destination offset 0 and then at offset 10, keeping both return
 * values, and reports an error about the returned length.
 */
2803 public void TestReturnFailure(){
\r
2804 char[] term = {'r','\u00e9','s','u','m','\u00e9' };
\r
// Buffer size is 10 + input length + 2, leaving room for the second call,
// which starts writing at offset 10.
2805 char[] decomposed_term = new char[10 + term.length + 2];
\r
2806 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0);
\r
2807 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0);
\r
// NOTE(review): the condition guarding this error is not visible in this
// block; presumably it compares rc with rc1 - confirm against the full file.
2809 errln("Normalizer decompose did not return correct length");
\r
/**
 * Holder for one composition test case: a normalization mode and options,
 * an input string and the expected output string.
 */
2813 private final static class TestCompositionCase {
\r
2814 public Normalizer.Mode mode;
\r
2815 public int options;
\r
2816 public String input, expect;
\r
// NOTE(review): only the assignments of options and expect are visible in this
// block; the mode and input assignments are presumably on lines not shown.
2817 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) {
\r
2819 this.options=options;
\r
2821 this.expect=expect;
\r
/**
 * Runs every TestCompositionCase through
 * Normalizer.normalize(input, mode, options) and reports an error when the
 * result differs from the expected string. All cases visible here use
 * Normalizer.NFC with options 0.
 */
2825 public void TestComposition() {
\r
2826 final TestCompositionCase cases[]=new TestCompositionCase[]{
\r
2828 * special cases for UAX #15 bug
\r
2829 * see Unicode Corrigendum #5: Normalization Idempotency
\r
2830 * at http://unicode.org/versions/corrigendum5.html
\r
2831 * (was Public Review Issue #29)
\r
// Each case: mode, options, input, expected output.
2833 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"),
\r
2834 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),
\r
2835 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"),
\r
2836 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"),
\r
2838 /* TODO: add test cases for UNORM_FCC here (j2151) */
\r
// NOTE(review): the declarations of i and output are not visible in this block.
2844 for(i=0; i<cases.length; ++i) {
\r
2845 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options);
\r
2846 if(!output.equals(cases[i].expect)) {
\r
2847 errln("unexpected result for case "+i);
\r
/**
 * Normalizes a table of inputs with a custom Normalizer2 obtained from the
 * testnorm.nrm test resource in Normalizer2.Mode.COMPOSE and reports an error
 * for every input whose result differs from the expected string.
 */
2852 public void TestCustomComp() {
\r
// Each pair is { input, expected }, written with escapes that are resolved by
// Utility.unescape() in the loop below.
2853 String [][] pairs={
\r
2854 { "\\uD801\\uE000\\uDFFE", "" },
\r
2855 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
\r
2856 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
\r
2857 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },
\r
2858 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
\r
2859 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
\r
2860 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
\r
2861 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
\r
// NOTE(review): the assignment target of the getInstance() call and one of its
// arguments are not visible in this block; the resource stream is not closed
// in the visible lines.
2863 Normalizer2 customNorm2;
\r
2865 Normalizer2.getInstance(
\r
2866 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),
\r
2868 Normalizer2.Mode.COMPOSE);
\r
// Unescape each pair, normalize the input and compare with the expectation.
2869 for(int i=0; i<pairs.length; ++i) {
\r
2870 String[] pair=pairs[i];
\r
2871 String input=Utility.unescape(pair[0]);
\r
2872 String expected=Utility.unescape(pair[1]);
\r
2873 String result=customNorm2.normalize(input);
\r
2874 if(!result.equals(expected)) {
\r
2875 errln("custom compose Normalizer2 did not normalize input "+i+" as expected");
\r
/**
 * Normalizes a table of inputs with a custom Normalizer2 obtained from the
 * testnorm.nrm test resource in Normalizer2.Mode.COMPOSE_CONTIGUOUS and reports
 * an error for every input whose result differs from the expected string.
 */
2880 public void TestCustomFCC() {
\r
// Each pair is { input, expected }, written with escapes that are resolved by
// Utility.unescape() in the loop below.
2881 String[][] pairs={
\r
2882 { "\\uD801\\uE000\\uDFFE", "" },
\r
2883 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
\r
2884 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
\r
2885 // The following expected result is different from CustomComp
\r
2886 // because of only-contiguous composition.
\r
2887 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },
\r
2888 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
\r
2889 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
\r
2890 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
\r
2891 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
\r
// NOTE(review): the assignment target of the getInstance() call and one of its
// arguments are not visible in this block; the resource stream is not closed
// in the visible lines.
2893 Normalizer2 customNorm2;
\r
2895 Normalizer2.getInstance(
\r
2896 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),
\r
2898 Normalizer2.Mode.COMPOSE_CONTIGUOUS);
\r
// Unescape each pair, normalize the input and compare with the expectation.
2899 for(int i=0; i<pairs.length; ++i) {
\r
2900 String[] pair=pairs[i];
\r
2901 String input=Utility.unescape(pair[0]);
\r
2902 String expected=Utility.unescape(pair[1]);
\r
2903 String result=customNorm2.normalize(input);
\r
2904 if(!result.equals(expected)) {
\r
2905 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected");
\r
2910 public void TestCanonIterData() {
\r
2911 // For now, just a regression test.
\r
2912 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData();
\r
2913 // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character
\r
2914 // in some decomposition mappings where there is a composition exclusion.
\r
2915 // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0)
\r
2916 // but it is not a segment starter because it occurs in a decomposition mapping.
\r
2917 if(impl.isCanonSegmentStarter(0xfb5)) {
\r
2918 errln("isCanonSegmentStarter(U+0fb5)=true is wrong");
\r
2920 // For [:Segment_Starter:] to work right, not just the property function has to work right,
\r
2921 // UnicodeSet also needs a correct range starts set.
\r
2922 UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze();
\r
2923 if(segStarters.contains(0xfb5)) {
\r
2924 errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong");
\r
2926 // Try characters up to Kana and miscellaneous CJK but below Han (for expediency).
\r
2927 for(int c=0; c<=0x33ff; ++c) {
\r
2928 boolean isStarter=impl.isCanonSegmentStarter(c);
\r
2929 boolean isContained=segStarters.contains(c);
\r
2930 if(isStarter!=isContained) {
\r
2931 errln(String.format(
\r
2932 "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " +
\r
2933 "[:Segment_Starter:].contains(same)",
\r