]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / tests / core / src / com / ibm / icu / dev / test / normalizer / BasicTest.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 1996-2010, International Business Machines Corporation and\r
4  * others. All Rights Reserved.\r
5  *******************************************************************************\r
6  */\r
7 \r
8 package com.ibm.icu.dev.test.normalizer;\r
9 \r
10 import java.text.StringCharacterIterator;\r
11 import java.util.Random;\r
12 \r
13 import com.ibm.icu.dev.test.TestFmwk;\r
14 import com.ibm.icu.impl.Norm2AllModes;\r
15 import com.ibm.icu.impl.Normalizer2Impl;\r
16 import com.ibm.icu.impl.USerializedSet;\r
17 import com.ibm.icu.impl.Utility;\r
18 import com.ibm.icu.lang.UCharacter;\r
19 import com.ibm.icu.lang.UCharacterCategory;\r
20 import com.ibm.icu.lang.UProperty;\r
21 import com.ibm.icu.text.Normalizer;\r
22 import com.ibm.icu.text.Normalizer2;\r
23 import com.ibm.icu.text.UCharacterIterator;\r
24 import com.ibm.icu.text.UTF16;\r
25 import com.ibm.icu.text.UnicodeSet;\r
26 import com.ibm.icu.text.UnicodeSetIterator;\r
27 \r
28 \r
29 public class BasicTest extends TestFmwk {\r
30     public static void main(String[] args) throws Exception {\r
31         new BasicTest().run(args);\r
32     }\r
33 \r
34     String[][] canonTests = {\r
35         // Input                Decomposed              Composed\r
36         { "cat",                "cat",                  "cat"               },\r
37         { "\u00e0ardvark",      "a\u0300ardvark",       "\u00e0ardvark",    },\r
38 \r
39         { "\u1e0a",             "D\u0307",              "\u1e0a"            }, // D-dot_above\r
40         { "D\u0307",            "D\u0307",              "\u1e0a"            }, // D dot_above\r
41 \r
42         { "\u1e0c\u0307",       "D\u0323\u0307",        "\u1e0c\u0307"      }, // D-dot_below dot_above\r
43         { "\u1e0a\u0323",       "D\u0323\u0307",        "\u1e0c\u0307"      }, // D-dot_above dot_below\r
44         { "D\u0307\u0323",      "D\u0323\u0307",        "\u1e0c\u0307"      }, // D dot_below dot_above\r
45 \r
46         { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307",  "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above\r
47         { "D\u0307\u0328\u0323","D\u0328\u0323\u0307",  "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below\r
48 \r
49         { "\u1E14",             "E\u0304\u0300",        "\u1E14"            }, // E-macron-grave\r
50         { "\u0112\u0300",       "E\u0304\u0300",        "\u1E14"            }, // E-macron + grave\r
51         { "\u00c8\u0304",       "E\u0300\u0304",        "\u00c8\u0304"      }, // E-grave + macron\r
52 \r
53         { "\u212b",             "A\u030a",              "\u00c5"            }, // angstrom_sign\r
54         { "\u00c5",             "A\u030a",              "\u00c5"            }, // A-ring\r
55 \r
56         { "\u00c4ffin",         "A\u0308ffin",          "\u00c4ffin"        },\r
57         { "\u00c4\uFB03n",      "A\u0308\uFB03n",       "\u00c4\uFB03n"     },\r
58 \r
59         { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        }, //updated with 3.0\r
60         { "\u00fd\uFB03n",      "y\u0301\uFB03n",       "\u00fd\uFB03n"     }, //updated with 3.0\r
61 \r
62         { "Henry IV",           "Henry IV",             "Henry IV"          },\r
63         { "Henry \u2163",       "Henry \u2163",         "Henry \u2163"      },\r
64 \r
65         { "\u30AC",             "\u30AB\u3099",         "\u30AC"            }, // ga (Katakana)\r
66         { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            }, // ka + ten\r
67         { "\uFF76\uFF9E",       "\uFF76\uFF9E",         "\uFF76\uFF9E"      }, // hw_ka + hw_ten\r
68         { "\u30AB\uFF9E",       "\u30AB\uFF9E",         "\u30AB\uFF9E"      }, // ka + hw_ten\r
69         { "\uFF76\u3099",       "\uFF76\u3099",         "\uFF76\u3099"      }, // hw_ka + ten\r
70 \r
71         { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },\r
72         {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},\r
73     };\r
74 \r
75     String[][] compatTests = {\r
76             // Input                Decomposed              Composed\r
77         { "cat",                 "cat",                     "cat"           },\r
78         { "\uFB4f",             "\u05D0\u05DC",         "\u05D0\u05DC",     }, // Alef-Lamed vs. Alef, Lamed\r
79 \r
80         { "\u00C4ffin",         "A\u0308ffin",          "\u00C4ffin"        },\r
81         { "\u00C4\uFB03n",      "A\u0308ffin",          "\u00C4ffin"        }, // ffi ligature -> f + f + i\r
82 \r
83         { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },        //updated for 3.0\r
84         { "\u00fd\uFB03n",      "y\u0301ffin",          "\u00fdffin"        }, // ffi ligature -> f + f + i\r
85 \r
86         { "Henry IV",           "Henry IV",             "Henry IV"          },\r
87         { "Henry \u2163",       "Henry IV",             "Henry IV"          },\r
88 \r
89         { "\u30AC",             "\u30AB\u3099",         "\u30AC"            }, // ga (Katakana)\r
90         { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            }, // ka + ten\r
91 \r
92         { "\uFF76\u3099",       "\u30AB\u3099",         "\u30AC"            }, // hw_ka + ten\r
93 \r
94         /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/\r
95         { "\uFF76\uFF9E",       "\u30AB\u3099",         "\u30AC"            }, // hw_ka + hw_ten\r
96         { "\u30AB\uFF9E",       "\u30AB\u3099",         "\u30AC"            }, // ka + hw_ten\r
97 \r
98     };\r
99 \r
100     // With Canonical decomposition, Hangul syllables should get decomposed\r
101     // into Jamo, but Jamo characters should not be decomposed into\r
102     // conjoining Jamo\r
103     String[][] hangulCanon = {\r
104         // Input                Decomposed              Composed\r
105         { "\ud4db",             "\u1111\u1171\u11b6",   "\ud4db"        },\r
106         { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6",   "\ud4db"        },\r
107     };\r
108 \r
109     // With compatibility decomposition turned on,\r
110     // it should go all the way down to conjoining Jamo characters.\r
111     // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE\r
112     String[][] hangulCompat = {\r
113         // Input        Decomposed                          Composed\r
114         // { "\ud4db",     "\u1111\u116e\u1175\u11af\u11c2",   "\ud478\u1175\u11af\u11c2"  },\r
115     };\r
116 \r
117     public void TestHangulCompose()\r
118                 throws Exception{\r
119         // Make sure that the static composition methods work\r
120         logln("Canonical composition...");\r
121         staticTest(Normalizer.NFC, hangulCanon,  2);\r
122         logln("Compatibility composition...");\r
123         staticTest(Normalizer.NFKC, hangulCompat, 2);\r
124         // Now try iterative composition....\r
125         logln("Iterative composition...");\r
126         Normalizer norm = new Normalizer("", Normalizer.NFC,0);\r
127         iterateTest(norm, hangulCanon, 2);\r
128 \r
129         norm.setMode(Normalizer.NFKD);\r
130         iterateTest(norm, hangulCompat, 2);\r
131 \r
132         // And finally, make sure you can do it in reverse too\r
133         logln("Reverse iteration...");\r
134         norm.setMode(Normalizer.NFC);\r
135         backAndForth(norm, hangulCanon);\r
136      }\r
137 \r
138     public void TestHangulDecomp() throws Exception{\r
139         // Make sure that the static decomposition methods work\r
140         logln("Canonical decomposition...");\r
141         staticTest(Normalizer.NFD, hangulCanon,  1);\r
142         logln("Compatibility decomposition...");\r
143         staticTest(Normalizer.NFKD, hangulCompat, 1);\r
144 \r
145          // Now the iterative decomposition methods...\r
146         logln("Iterative decomposition...");\r
147         Normalizer norm = new Normalizer("", Normalizer.NFD,0);\r
148         iterateTest(norm, hangulCanon, 1);\r
149 \r
150         norm.setMode(Normalizer.NFKD);\r
151         iterateTest(norm, hangulCompat, 1);\r
152 \r
153         // And finally, make sure you can do it in reverse too\r
154         logln("Reverse iteration...");\r
155         norm.setMode(Normalizer.NFD);\r
156         backAndForth(norm, hangulCanon);\r
157     }\r
158     public void TestNone() throws Exception{\r
159         Normalizer norm = new Normalizer("", Normalizer.NONE,0);\r
160         iterateTest(norm, canonTests, 0);\r
161         staticTest(Normalizer.NONE, canonTests, 0);\r
162     }\r
163     public void TestDecomp() throws Exception{\r
164         Normalizer norm = new Normalizer("", Normalizer.NFD,0);\r
165         iterateTest(norm, canonTests, 1);\r
166         staticTest(Normalizer.NFD, canonTests, 1);\r
167         decomposeTest(Normalizer.NFD, canonTests, 1);\r
168     }\r
169 \r
170     public void TestCompatDecomp() throws Exception{\r
171         Normalizer norm = new Normalizer("", Normalizer.NFKD,0);\r
172         iterateTest(norm, compatTests, 1);\r
173         staticTest(Normalizer.NFKD,compatTests, 1);\r
174         decomposeTest(Normalizer.NFKD,compatTests, 1);\r
175     }\r
176 \r
177     public void TestCanonCompose() throws Exception{\r
178         Normalizer norm = new Normalizer("", Normalizer.NFC,0);\r
179         iterateTest(norm, canonTests, 2);\r
180         staticTest(Normalizer.NFC, canonTests, 2);\r
181         composeTest(Normalizer.NFC, canonTests, 2);\r
182     }\r
183 \r
184     public void TestCompatCompose() throws Exception{\r
185         Normalizer norm = new Normalizer("", Normalizer.NFKC,0);\r
186         iterateTest(norm, compatTests, 2);\r
187         staticTest(Normalizer.NFKC,compatTests, 2);\r
188         composeTest(Normalizer.NFKC,compatTests, 2);\r
189     }\r
190 \r
191     public void TestExplodingBase() throws Exception{\r
192         // \u017f - Latin small letter long s\r
193         // \u0307 - combining dot above\r
194         // \u1e61 - Latin small letter s with dot above\r
195         // \u1e9b - Latin small letter long s with dot above\r
196         String[][] canon = {\r
197             // Input                Decomposed              Composed\r
198             { "Tschu\u017f",        "Tschu\u017f",          "Tschu\u017f"    },\r
199             { "Tschu\u1e9b",        "Tschu\u017f\u0307",    "Tschu\u1e9b"    },\r
200         };\r
201         String[][] compat = {\r
202             // Input                Decomposed              Composed\r
203             { "\u017f",        "s",              "s"           },\r
204             { "\u1e9b",        "s\u0307",        "\u1e61"      },\r
205         };\r
206 \r
207         staticTest(Normalizer.NFD, canon,  1);\r
208         staticTest(Normalizer.NFC, canon,  2);\r
209 \r
210         staticTest(Normalizer.NFKD, compat, 1);\r
211         staticTest(Normalizer.NFKC, compat, 2);\r
212 \r
213     }\r
214 \r
215     /**\r
216      * The Tibetan vowel sign AA, 0f71, was messed up prior to\r
217      * Unicode version 2.1.9.\r
218      * Once 2.1.9 or 3.0 is released, uncomment this test.\r
219      */\r
220     public void TestTibetan() throws Exception{\r
221         String[][] decomp = {\r
222             { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }\r
223         };\r
224         String[][] compose = {\r
225             { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }\r
226         };\r
227 \r
228         staticTest(Normalizer.NFD, decomp, 1);\r
229         staticTest(Normalizer.NFKD,decomp, 2);\r
230         staticTest(Normalizer.NFC, compose, 1);\r
231         staticTest(Normalizer.NFKC,compose, 2);\r
232     }\r
233 \r
234     /**\r
235      * Make sure characters in the CompositionExclusion.txt list do not get\r
236      * composed to.\r
237      */\r
238     public void TestCompositionExclusion()\r
239                 throws Exception{\r
240         // This list is generated from CompositionExclusion.txt.\r
241         // Update whenever the normalizer tables are updated.  Note\r
242         // that we test all characters listed, even those that can be\r
243         // derived from the Unicode DB and are therefore commented\r
244         // out.\r
245         String EXCLUDED =\r
246             "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +\r
247             "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +\r
248             "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +\r
249             "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +\r
250             "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +\r
251             "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +\r
252             "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" +\r
253             "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +\r
254             "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +\r
255             "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +\r
256             "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +\r
257             "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +\r
258             "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +\r
259             "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E";\r
260         for (int i=0; i<EXCLUDED.length(); ++i) {\r
261             String a = String.valueOf(EXCLUDED.charAt(i));\r
262             String b = Normalizer.normalize(a, Normalizer.NFKD);\r
263             String c = Normalizer.normalize(b, Normalizer.NFC);\r
264             if (c.equals(a)) {\r
265                 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +\r
266                       hex(b) + " x COMPOSE => " +\r
267                       hex(c));\r
268             } else if (isVerbose()) {\r
269                 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +\r
270                       hex(b) + " x COMPOSE => " +\r
271                       hex(c));\r
272             }\r
273         }\r
274         // The following method works too, but it is somewhat\r
275         // incestuous.  It uses UInfo, which is the same database that\r
276         // NormalizerBuilder uses, so if something is wrong with\r
277         // UInfo, the following test won't show it.  All it will show\r
278         // is that NormalizerBuilder has been run with whatever the\r
279         // current UInfo is.\r
280         //\r
281         // We comment this out in favor of the test above, which\r
282         // provides independent verification (but also requires\r
283         // independent updating).\r
284 //      logln("---");\r
285 //      UInfo uinfo = new UInfo();\r
286 //      for (int i=0; i<=0xFFFF; ++i) {\r
287 //          if (!uinfo.isExcludedComposition((char)i) ||\r
288 //              (!uinfo.hasCanonicalDecomposition((char)i) &&\r
289 //               !uinfo.hasCompatibilityDecomposition((char)i))) continue;\r
290 //          String a = String.valueOf((char)i);\r
291 //          String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0);\r
292 //          String c = Normalizer.normalize(b,Normalizer.COMPOSE,0);\r
293 //          if (c.equals(a)) {\r
294 //              errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +\r
295 //                    hex(b) + " x COMPOSE => " +\r
296 //                    hex(c));\r
297 //          } else if (isVerbose()) {\r
298 //              logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +\r
299 //                    hex(b) + " x COMPOSE => " +\r
300 //                    hex(c));\r
301 //          }\r
302 //      }\r
303     }\r
304 \r
305     /**\r
306      * Test for a problem that showed up just before ICU 1.6 release\r
307      * having to do with combining characters with an index of zero.\r
308      * Such characters do not participate in any canonical\r
309      * decompositions.  However, having an index of zero means that\r
310      * they all share one typeMask[] entry, that is, they all have to\r
311      * map to the same canonical class, which is not the case, in\r
312      * reality.\r
313      */\r
314     public void TestZeroIndex()\r
315                 throws Exception{\r
316         String[] DATA = {\r
317             // Expect col1 x COMPOSE_COMPAT => col2\r
318             // Expect col2 x DECOMP => col3\r
319             "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",\r
320             "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",\r
321             "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",\r
322             "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",\r
323             "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",\r
324         };\r
325 \r
326         for (int i=0; i<DATA.length; i+=3) {\r
327             String a = DATA[i];\r
328             String b = Normalizer.normalize(a, Normalizer.NFKC);\r
329             String exp = DATA[i+1];\r
330             if (b.equals(exp)) {\r
331                 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));\r
332             } else {\r
333                 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +\r
334                       ", expect " + hex(exp));\r
335             }\r
336             a = Normalizer.normalize(b, Normalizer.NFD);\r
337             exp = DATA[i+2];\r
338             if (a.equals(exp)) {\r
339                 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a));\r
340             } else {\r
341                 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) +\r
342                       ", expect " + hex(exp));\r
343             }\r
344         }\r
345     }\r
346 \r
347     /**\r
348      * Test for a problem found by Verisign.  Problem is that\r
349      * characters at the start of a string are not put in canonical\r
350      * order correctly by compose() if there is no starter.\r
351      */\r
352     public void TestVerisign()\r
353                 throws Exception{\r
354         String[] inputs = {\r
355             "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",\r
356             "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"\r
357         };\r
358         String[] outputs = {\r
359             "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",\r
360             "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"\r
361         };\r
362 \r
363         for (int i = 0; i < inputs.length; ++i) {\r
364             String input = inputs[i];\r
365             String output = outputs[i];\r
366             String result = Normalizer.decompose(input, false);\r
367             if (!result.equals(output)) {\r
368                 errln("FAIL input: " + hex(input));\r
369                 errln(" decompose: " + hex(result));\r
370                 errln("  expected: " + hex(output));\r
371             }\r
372             result = Normalizer.compose(input, false);\r
373             if (!result.equals(output)) {\r
374                 errln("FAIL input: " + hex(input));\r
375                 errln("   compose: " + hex(result));\r
376                 errln("  expected: " + hex(output));\r
377             }\r
378         }\r
379 \r
380     }\r
381     public void  TestQuickCheckResultNO()\r
382                  throws Exception{\r
383         final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C,\r
384                                 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E};\r
385         final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB,\r
386                                 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E};\r
387         final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE,\r
388                                 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};\r
389         final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE,\r
390                                 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};\r
391 \r
392 \r
393         final int SIZE = 10;\r
394 \r
395         int count = 0;\r
396         for (; count < SIZE; count ++)\r
397         {\r
398             if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),\r
399                     Normalizer.NFD,0) != Normalizer.NO)\r
400             {\r
401                 errln("ERROR in NFD quick check at U+" +\r
402                        Integer.toHexString(CPNFD[count]));\r
403                 return;\r
404             }\r
405             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),\r
406                         Normalizer.NFC,0) !=Normalizer.NO)\r
407             {\r
408                 errln("ERROR in NFC quick check at U+"+\r
409                        Integer.toHexString(CPNFC[count]));\r
410                 return;\r
411             }\r
412             if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),\r
413                                 Normalizer.NFKD,0) != Normalizer.NO)\r
414             {\r
415                 errln("ERROR in NFKD quick check at U+"+\r
416                        Integer.toHexString(CPNFKD[count]));\r
417                 return;\r
418             }\r
419             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
420                                          Normalizer.NFKC,0) !=Normalizer.NO)\r
421             {\r
422                 errln("ERROR in NFKC quick check at U+"+\r
423                        Integer.toHexString(CPNFKC[count]));\r
424                 return;\r
425             }\r
426             // for improving coverage\r
427             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
428                                          Normalizer.NFKC) !=Normalizer.NO)\r
429             {\r
430                 errln("ERROR in NFKC quick check at U+"+\r
431                        Integer.toHexString(CPNFKC[count]));\r
432                 return;\r
433             }\r
434         }\r
435     }\r
436 \r
437 \r
438     public void TestQuickCheckResultYES()\r
439                 throws Exception{\r
440         final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A,\r
441                                 0x2261, 0x3075, 0x4000, 0x5000, 0xF000};\r
442         final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500,\r
443                                 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000};\r
444         final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB,\r
445                                 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27};\r
446         final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000,\r
447                                 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E};\r
448 \r
449         final int SIZE = 10;\r
450         int count = 0;\r
451 \r
452         char cp = 0;\r
453         while (cp < 0xA0)\r
454         {\r
455             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0)\r
456                                             != Normalizer.YES)\r
457             {\r
458                 errln("ERROR in NFD quick check at U+"+\r
459                                                       Integer.toHexString(cp));\r
460                 return;\r
461             }\r
462             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0)\r
463                                              != Normalizer.YES)\r
464             {\r
465                 errln("ERROR in NFC quick check at U+"+\r
466                                                       Integer.toHexString(cp));\r
467                 return;\r
468             }\r
469             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0)\r
470                                              != Normalizer.YES)\r
471             {\r
472                 errln("ERROR in NFKD quick check at U+" +\r
473                                                       Integer.toHexString(cp));\r
474                 return;\r
475             }\r
476             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0)\r
477                                              != Normalizer.YES)\r
478             {\r
479                 errln("ERROR in NFKC quick check at U+"+\r
480                                                        Integer.toHexString(cp));\r
481                 return;\r
482             }\r
483             // improve the coverage\r
484             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC)\r
485                                              != Normalizer.YES)\r
486             {\r
487                 errln("ERROR in NFKC quick check at U+"+\r
488                                                        Integer.toHexString(cp));\r
489                 return;\r
490             }\r
491             cp++;\r
492         }\r
493 \r
494         for (; count < SIZE; count ++)\r
495         {\r
496             if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),\r
497                                          Normalizer.NFD,0)!=Normalizer.YES)\r
498             {\r
499                 errln("ERROR in NFD quick check at U+"+\r
500                                              Integer.toHexString(CPNFD[count]));\r
501                 return;\r
502             }\r
503             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),\r
504                                          Normalizer.NFC,0)!=Normalizer.YES)\r
505             {\r
506                 errln("ERROR in NFC quick check at U+"+\r
507                                              Integer.toHexString(CPNFC[count]));\r
508                 return;\r
509             }\r
510             if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),\r
511                                          Normalizer.NFKD,0)!=Normalizer.YES)\r
512             {\r
513                 errln("ERROR in NFKD quick check at U+"+\r
514                                     Integer.toHexString(CPNFKD[count]));\r
515                 return;\r
516             }\r
517             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
518                                          Normalizer.NFKC,0)!=Normalizer.YES)\r
519             {\r
520                 errln("ERROR in NFKC quick check at U+"+\r
521                         Integer.toHexString(CPNFKC[count]));\r
522                 return;\r
523             }\r
524             // improve the coverage\r
525             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
526                                          Normalizer.NFKC)!=Normalizer.YES)\r
527             {\r
528                 errln("ERROR in NFKC quick check at U+"+\r
529                         Integer.toHexString(CPNFKC[count]));\r
530                 return;\r
531             }\r
532         }\r
533     }\r
534     public void TestBengali() throws Exception{\r
535         String input = "\u09bc\u09be\u09cd\u09be";\r
536         String output=Normalizer.normalize(input,Normalizer.NFC);\r
537         if(!input.equals(output)){\r
538              errln("ERROR in NFC of string");\r
539         }\r
540     }\r
541     public void TestQuickCheckResultMAYBE()\r
542                 throws Exception{\r
543 \r
544         final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161,\r
545                                 0x116A, 0x1173, 0x1175, 0x3099, 0x309A};\r
546         final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E,\r
547                                 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099};\r
548 \r
549 \r
550         final int SIZE = 10;\r
551 \r
552         int count = 0;\r
553 \r
554         /* NFD and NFKD does not have any MAYBE codepoints */\r
555         for (; count < SIZE; count ++)\r
556         {\r
557             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),\r
558                                         Normalizer.NFC,0)!=Normalizer.MAYBE)\r
559             {\r
560                 errln("ERROR in NFC quick check at U+"+\r
561                                             Integer.toHexString(CPNFC[count]));\r
562                 return;\r
563             }\r
564             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
565                                        Normalizer.NFKC,0)!=Normalizer.MAYBE)\r
566             {\r
567                 errln("ERROR in NFKC quick check at U+"+\r
568                                             Integer.toHexString(CPNFKC[count]));\r
569                 return;\r
570             }\r
571             if (Normalizer.quickCheck(new char[]{CPNFC[count]},\r
572                                         Normalizer.NFC,0)!=Normalizer.MAYBE)\r
573             {\r
574                 errln("ERROR in NFC quick check at U+"+\r
575                                             Integer.toHexString(CPNFC[count]));\r
576                 return;\r
577             }\r
578             if (Normalizer.quickCheck(new char[]{CPNFKC[count]},\r
579                                        Normalizer.NFKC,0)!=Normalizer.MAYBE)\r
580             {\r
581                 errln("ERROR in NFKC quick check at U+"+\r
582                                             Integer.toHexString(CPNFKC[count]));\r
583                 return;\r
584             }\r
585             if (Normalizer.quickCheck(new char[]{CPNFKC[count]},\r
586                                        Normalizer.NONE,0)!=Normalizer.YES)\r
587             {\r
588                 errln("ERROR in NONE quick check at U+"+\r
589                                             Integer.toHexString(CPNFKC[count]));\r
590                 return;\r
591             }\r
592         }\r
593     }\r
594 \r
595     public void TestQuickCheckStringResult()\r
596                 throws Exception{\r
597         int count;\r
598         String d;\r
599         String c;\r
600 \r
601         for (count = 0; count < canonTests.length; count ++)\r
602         {\r
603             d = canonTests[count][1];\r
604             c = canonTests[count][2];\r
605             if (Normalizer.quickCheck(d,Normalizer.NFD,0)\r
606                                             != Normalizer.YES)\r
607             {\r
608                 errln("ERROR in NFD quick check for string at count " + count);\r
609                 return;\r
610             }\r
611 \r
612             if (Normalizer.quickCheck(c, Normalizer.NFC,0)\r
613                                             == Normalizer.NO)\r
614             {\r
615                 errln("ERROR in NFC quick check for string at count " + count);\r
616                 return;\r
617             }\r
618         }\r
619 \r
620         for (count = 0; count < compatTests.length; count ++)\r
621         {\r
622             d = compatTests[count][1];\r
623             c = compatTests[count][2];\r
624             if (Normalizer.quickCheck(d, Normalizer.NFKD,0)\r
625                                             != Normalizer.YES)\r
626             {\r
627                 errln("ERROR in NFKD quick check for string at count " + count);\r
628                 return;\r
629             }\r
630 \r
631             if (Normalizer.quickCheck(c,  Normalizer.NFKC,0)\r
632                                             != Normalizer.YES)\r
633             {\r
634                 errln("ERROR in NFKC quick check for string at count " + count);\r
635                 return;\r
636             }\r
637         }\r
638     }\r
639 \r
640     static final int qcToInt(Normalizer.QuickCheckResult qc) {\r
641         if(qc==Normalizer.NO) {\r
642             return 0;\r
643         } else if(qc==Normalizer.YES) {\r
644             return 1;\r
645         } else /* Normalizer.MAYBE */ {\r
646             return 2;\r
647         }\r
648     }\r
649 \r
650     public void TestQuickCheckPerCP() {\r
651         int c, lead, trail;\r
652         String s, nfd;\r
653         int lccc1, lccc2, tccc1, tccc2;\r
654         int qc1, qc2;\r
655 \r
656         if(\r
657             UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES\r
658             UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 ||\r
659             UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE\r
660             UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 ||\r
661             UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) ||\r
662             UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)\r
663         ) {\r
664             errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS");\r
665         }\r
666 \r
667         /*\r
668          * compare the quick check property values for some code points\r
669          * to the quick check results for checking same-code point strings\r
670          */\r
671         c=0;\r
672         while(c<0x110000) {\r
673             s=UTF16.valueOf(c);\r
674 \r
675             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK);\r
676             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC));\r
677             if(qc1!=qc2) {\r
678                 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c));\r
679             }\r
680 \r
681             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK);\r
682             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD));\r
683             if(qc1!=qc2) {\r
684                 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c));\r
685             }\r
686 \r
687             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK);\r
688             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC));\r
689             if(qc1!=qc2) {\r
690                 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c));\r
691             }\r
692 \r
693             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK);\r
694             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD));\r
695             if(qc1!=qc2) {\r
696                 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c));\r
697             }\r
698 \r
699             nfd=Normalizer.normalize(s, Normalizer.NFD);\r
700             lead=UTF16.charAt(nfd, 0);\r
701             trail=UTF16.charAt(nfd, nfd.length()-1);\r
702 \r
703             lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS);\r
704             lccc2=UCharacter.getCombiningClass(lead);\r
705             tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);\r
706             tccc2=UCharacter.getCombiningClass(trail);\r
707 \r
708             if(lccc1!=lccc2) {\r
709                 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c));\r
710             }\r
711             if(tccc1!=tccc2) {\r
712                 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c));\r
713             }\r
714 \r
715             /* skip some code points */\r
716             c=(20*c)/19+1;\r
717         }\r
718     }\r
719 \r
//------------------------------------------------------------------------
// Internal utilities
//
726 \r
727 /*    private void backAndForth(Normalizer iter, String input)\r
728     {\r
729         iter.setText(input);\r
730 \r
731         // Run through the iterator forwards and stick it into a StringBuffer\r
732         StringBuffer forward =  new StringBuffer();\r
733         for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {\r
734             forward.append(ch);\r
735         }\r
736 \r
737         // Now do it backwards\r
738         StringBuffer reverse = new StringBuffer();\r
739         for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {\r
740             reverse.insert(0, ch);\r
741         }\r
742 \r
743         if (!forward.toString().equals(reverse.toString())) {\r
744             errln("FAIL: Forward/reverse mismatch for input " + hex(input)\r
745                   + ", forward: " + hex(forward) + ", backward: "+hex(reverse));\r
746         } else if (isVerbose()) {\r
747             logln("Ok: Forward/reverse for input " + hex(input)\r
748                   + ", forward: " + hex(forward) + ", backward: "+hex(reverse));\r
749         }\r
750     }*/\r
751 \r
752     private void backAndForth(Normalizer iter, String[][] tests)\r
753     {\r
754         for (int i = 0; i < tests.length; i++)\r
755         {\r
756             iter.setText(tests[i][0]);\r
757 \r
758             // Run through the iterator forwards and stick it into a\r
759             // StringBuffer\r
760             StringBuffer forward =  new StringBuffer();\r
761             for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {\r
762                 forward.append(ch);\r
763             }\r
764 \r
765             // Now do it backwards\r
766             StringBuffer reverse = new StringBuffer();\r
767             for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {\r
768                 reverse.insert(0, ch);\r
769             }\r
770 \r
771             if (!forward.toString().equals(reverse.toString())) {\r
772                 errln("FAIL: Forward/reverse mismatch for input "\r
773                     + hex(tests[i][0]) + ", forward: " + hex(forward)\r
774                     + ", backward: " + hex(reverse));\r
775             } else if (isVerbose()) {\r
776                 logln("Ok: Forward/reverse for input " + hex(tests[i][0])\r
777                       + ", forward: " + hex(forward) + ", backward: "\r
778                       + hex(reverse));\r
779             }\r
780         }\r
781     }\r
782 \r
783     private void staticTest (Normalizer.Mode mode,\r
784                              String[][] tests, int outCol) throws Exception{\r
785         for (int i = 0; i < tests.length; i++)\r
786         {\r
787             String input = Utility.unescape(tests[i][0]);\r
788             String expect = Utility.unescape(tests[i][outCol]);\r
789 \r
790             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
791 \r
792             String output = Normalizer.normalize(input, mode);\r
793 \r
794             if (!output.equals(expect)) {\r
795                 errln("FAIL: case " + i\r
796                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
797                     + " but got '" + output + "' (" + hex(output) + ")" );\r
798             }\r
799         }\r
800         char[] output = new char[1];\r
801         for (int i = 0; i < tests.length; i++)\r
802         {\r
803             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
804             String expect =Utility.unescape( tests[i][outCol]);\r
805 \r
806             logln("Normalizing '" + new String(input) + "' (" +\r
807                         hex(new String(input)) + ")" );\r
808             int reqLength=0;\r
809             while(true){\r
810                 try{\r
811                     reqLength=Normalizer.normalize(input,output, mode,0);\r
812                     if(reqLength<=output.length    ){\r
813                         break;\r
814                     }\r
815                 }catch(IndexOutOfBoundsException e){\r
816                     output= new char[Integer.parseInt(e.getMessage())];\r
817                     continue;\r
818                 }\r
819             }\r
820             if (!expect.equals(new String(output,0,reqLength))) {\r
821                 errln("FAIL: case " + i\r
822                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
823                     + " but got '" + new String(output)\r
824                     + "' ("  + hex(new String(output)) + ")" );\r
825             }\r
826         }\r
827     }\r
828     private void decomposeTest(Normalizer.Mode mode,\r
829                              String[][] tests, int outCol) throws Exception{\r
830         for (int i = 0; i < tests.length; i++)\r
831         {\r
832             String input = Utility.unescape(tests[i][0]);\r
833             String expect = Utility.unescape(tests[i][outCol]);\r
834 \r
835             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
836 \r
837             String output = Normalizer.decompose(input, mode==Normalizer.NFKD);\r
838 \r
839             if (!output.equals(expect)) {\r
840                 errln("FAIL: case " + i\r
841                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
842                     + " but got '" + output + "' (" + hex(output) + ")" );\r
843             }\r
844         }\r
845         char[] output = new char[1];\r
846         for (int i = 0; i < tests.length; i++)\r
847         {\r
848             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
849             String expect = Utility.unescape(tests[i][outCol]);\r
850 \r
851             logln("Normalizing '" + new String(input) + "' (" +\r
852                         hex(new String(input)) + ")" );\r
853             int reqLength=0;\r
854             while(true){\r
855                 try{\r
856                     reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0);\r
857                     if(reqLength<=output.length ){\r
858                         break;\r
859                     }\r
860                 }catch(IndexOutOfBoundsException e){\r
861                     output= new char[Integer.parseInt(e.getMessage())];\r
862                     continue;\r
863                 }\r
864             }\r
865             if (!expect.equals(new String(output,0,reqLength))) {\r
866                 errln("FAIL: case " + i\r
867                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
868                     + " but got '" + new String(output)\r
869                     + "' ("  + hex(new String(output)) + ")" );\r
870             }\r
871         }\r
872         output = new char[1];\r
873         for (int i = 0; i < tests.length; i++)\r
874         {\r
875            char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
876            String expect = Utility.unescape(tests[i][outCol]);\r
877     \r
878            logln("Normalizing '" + new String(input) + "' (" +\r
879                        hex(new String(input)) + ")" );\r
880            int reqLength=0;\r
881            while(true){\r
882                try{\r
883                    reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0);\r
884                    if(reqLength<=output.length ){\r
885                        break;\r
886                    }\r
887                }catch(IndexOutOfBoundsException e){\r
888                    output= new char[Integer.parseInt(e.getMessage())];\r
889                    continue;\r
890                }\r
891            }\r
892            if (!expect.equals(new String(output,0,reqLength))) {\r
893                errln("FAIL: case " + i\r
894                    + " expected '" + expect + "' (" + hex(expect) + ")"\r
895                    + " but got '" + new String(output)\r
896                    + "' ("  + hex(new String(output)) + ")" );\r
897            }\r
898            char[] output2 = new char[reqLength * 2];\r
899            System.arraycopy(output, 0, output2, 0, reqLength);\r
900            int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);\r
901            if(retLength != reqLength){\r
902                logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);\r
903            }\r
904         }\r
905     }\r
906 \r
907     private void composeTest(Normalizer.Mode mode,\r
908                              String[][] tests, int outCol) throws Exception{\r
909         for (int i = 0; i < tests.length; i++)\r
910         {\r
911             String input = Utility.unescape(tests[i][0]);\r
912             String expect = Utility.unescape(tests[i][outCol]);\r
913 \r
914             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
915 \r
916             String output = Normalizer.compose(input, mode==Normalizer.NFKC);\r
917 \r
918             if (!output.equals(expect)) {\r
919                 errln("FAIL: case " + i\r
920                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
921                     + " but got '" + output + "' (" + hex(output) + ")" );\r
922             }\r
923         }\r
924         char[] output = new char[1];\r
925         for (int i = 0; i < tests.length; i++)\r
926         {\r
927             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
928             String expect = Utility.unescape(tests[i][outCol]);\r
929 \r
930             logln("Normalizing '" + new String(input) + "' (" +\r
931                         hex(new String(input)) + ")" );\r
932             int reqLength=0;\r
933             while(true){\r
934                 try{\r
935                     reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0);\r
936                     if(reqLength<=output.length ){\r
937                         break;\r
938                     }\r
939                 }catch(IndexOutOfBoundsException e){\r
940                     output= new char[Integer.parseInt(e.getMessage())];\r
941                     continue;\r
942                 }\r
943             }\r
944             if (!expect.equals(new String(output,0,reqLength))) {\r
945                 errln("FAIL: case " + i\r
946                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
947                     + " but got '" + new String(output)\r
948                     + "' ("  + hex(new String(output)) + ")" );\r
949             }\r
950         }\r
951         output = new char[1];\r
952         for (int i = 0; i < tests.length; i++)\r
953         {\r
954             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
955             String expect = Utility.unescape(tests[i][outCol]);\r
956 \r
957             logln("Normalizing '" + new String(input) + "' (" +\r
958                         hex(new String(input)) + ")" );\r
959             int reqLength=0;\r
960             while(true){\r
961                 try{\r
962                     reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0);\r
963                     if(reqLength<=output.length ){\r
964                         break;\r
965                     }\r
966                 }catch(IndexOutOfBoundsException e){\r
967                     output= new char[Integer.parseInt(e.getMessage())];\r
968                     continue;\r
969                 }\r
970             }\r
971             if (!expect.equals(new String(output,0,reqLength))) {\r
972                 errln("FAIL: case " + i\r
973                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
974                     + " but got '" + new String(output)\r
975                     + "' ("  + hex(new String(output)) + ")" );\r
976             }\r
977             \r
978             char[] output2 = new char[reqLength * 2];\r
979             System.arraycopy(output, 0, output2, 0, reqLength);\r
980             int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);\r
981             if(retLength != reqLength){\r
982                 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);\r
983             }\r
984         }\r
985     }\r
986     private void iterateTest(Normalizer iter, String[][] tests, int outCol){\r
987         for (int i = 0; i < tests.length; i++)\r
988         {\r
989             String input = Utility.unescape(tests[i][0]);\r
990             String expect = Utility.unescape(tests[i][outCol]);\r
991 \r
992             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
993 \r
994             iter.setText(input);\r
995             assertEqual(expect, iter, "case " + i + " ");\r
996         }\r
997     }\r
998 \r
999     private void assertEqual(String expected, Normalizer iter, String msg)\r
1000     {\r
1001         int index = 0;\r
1002         int ch;\r
1003         UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);\r
1004         \r
1005         while ((ch=iter.next())!= Normalizer.DONE){\r
1006             if (index >= expected.length()) {\r
1007                 errln("FAIL: " + msg + "Unexpected character '" + (char)ch\r
1008                         + "' (" + hex(ch) + ")"\r
1009                         + " at index " + index);\r
1010                 break;\r
1011             }\r
1012             int want = UTF16.charAt(expected,index);\r
1013             if (ch != want) {\r
1014                 errln("FAIL: " + msg + "got '" + (char)ch\r
1015                         + "' (" + hex(ch) + ")"\r
1016                         + " but expected '" + want + "' (" + hex(want)+ ")"\r
1017                         + " at index " + index);\r
1018             }\r
1019             index+=  UTF16.getCharCount(ch);\r
1020         }\r
1021         if (index < expected.length()) {\r
1022             errln("FAIL: " + msg + "Only got " + index + " chars, expected "\r
1023             + expected.length());\r
1024         }\r
1025         \r
1026         cIter.setToLimit();\r
1027         while((ch=iter.previous())!=Normalizer.DONE){\r
1028             int want = cIter.previousCodePoint();\r
1029             if (ch != want ) {\r
1030                 errln("FAIL: " + msg + "got '" + (char)ch\r
1031                         + "' (" + hex(ch) + ")"\r
1032                         + " but expected '" + want + "' (" + hex(want) + ")"\r
1033                         + " at index " + index);\r
1034             }\r
1035         }\r
1036     }\r
1037     //--------------------------------------------------------------------------\r
1038 \r
1039     // NOTE: These tests are used for quick debugging so are not ported\r
1040     // to ICU4C tsnorm.cpp in intltest\r
1041     //\r
1042 \r
1043     public void TestDebugStatic(){\r
1044         String in = Utility.unescape("\\U0001D157\\U0001D165");\r
1045         if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){\r
1046             errln("isNormalized failed");\r
1047         }\r
1048 \r
1049         String input  =  "\uAD8B\uAD8B\uAD8B\uAD8B"+\r
1050             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1051             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1052             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1053             "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1054             "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1055             "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+\r
1056             "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+\r
1057             "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+\r
1058             "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+\r
1059             "\uAD8B\uAD8B\uAD8B\uAD8B"+\r
1060             "d\u031B\u0307\u0323";\r
1061         String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+\r
1062                         "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+\r
1063                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1064                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1065                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1066                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1067                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1068                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1069                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1070                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1071                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1072                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1073                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1074                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1075                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1076                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1077                         "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+\r
1078                         "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+\r
1079                         "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+\r
1080                         "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+\r
1081                         "cccccccccccccccccccccccccccccccccccccccccccccccc"+\r
1082                         "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+\r
1083                         "dddddddddddddddddddddddd"+\r
1084                         "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+\r
1085                         "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307";\r
1086             String output = Normalizer.normalize(Utility.unescape(input),\r
1087                             Normalizer.NFD);\r
1088             if(!expect.equals(output)){\r
1089                 errln("FAIL expected: "+hex(expect) + " got: "+hex(output));\r
1090             }\r
1091 \r
1092 \r
1093 \r
1094     }\r
1095     public void TestDebugIter(){\r
1096         String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");\r
1097         String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");\r
1098         Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)),\r
1099                                                 Normalizer.NONE,0);\r
1100         int index = 0;\r
1101         int ch;\r
1102         UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);\r
1103         \r
1104         while ((ch=iter.next())!= Normalizer.DONE){\r
1105             if (index >= expected.length()) {\r
1106                 errln("FAIL: " +  "Unexpected character '" + (char)ch\r
1107                         + "' (" + hex(ch) + ")"\r
1108                         + " at index " + index);\r
1109                 break;\r
1110             }\r
1111             int want = UTF16.charAt(expected,index);\r
1112             if (ch != want) {\r
1113                 errln("FAIL: " +  "got '" + (char)ch\r
1114                         + "' (" + hex(ch) + ")"\r
1115                         + " but expected '" + want + "' (" + hex(want)+ ")"\r
1116                         + " at index " + index);\r
1117             }\r
1118             index+=  UTF16.getCharCount(ch);\r
1119         }\r
1120         if (index < expected.length()) {\r
1121             errln("FAIL: " +  "Only got " + index + " chars, expected "\r
1122             + expected.length());\r
1123         }\r
1124         \r
1125         cIter.setToLimit();\r
1126         while((ch=iter.previous())!=Normalizer.DONE){\r
1127             int want = cIter.previousCodePoint();\r
1128             if (ch != want ) {\r
1129                 errln("FAIL: " + "got '" + (char)ch\r
1130                         + "' (" + hex(ch) + ")"\r
1131                         + " but expected '" + want + "' (" + hex(want) + ")"\r
1132                         + " at index " + index);\r
1133             }\r
1134         }\r
1135     }\r
1136     public void TestDebugIterOld(){\r
1137         String input = "\\U0001D15E";\r
1138         String expected = "\uD834\uDD57\uD834\uDD65";\r
1139         String expectedReverse = "\uD834\uDD65\uD834\uDD57";\r
1140         int index = 0;\r
1141         int ch;\r
1142         Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)),\r
1143                                                 Normalizer.NFKC,0);\r
1144         StringBuffer got = new StringBuffer();\r
1145         for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next())\r
1146         {\r
1147             if (index >= expected.length()) {\r
1148                 errln("FAIL: " +  "Unexpected character '" + (char)ch +\r
1149                        "' (" + hex(ch) + ")" + " at index " + index);\r
1150                 break;\r
1151             }\r
1152             got.append(UCharacter.toString(ch));\r
1153             index++;\r
1154         }\r
1155         if (!expected.equals(got.toString())) {\r
1156                 errln("FAIL: " +  "got '" +got+ "' (" + hex(got) + ")"\r
1157                         + " but expected '" + expected + "' ("\r
1158                         + hex(expected) + ")");\r
1159         }\r
1160         if (got.length() < expected.length()) {\r
1161             errln("FAIL: " +  "Only got " + index + " chars, expected "\r
1162                            + expected.length());\r
1163         }\r
1164 \r
1165         logln("Reverse Iteration\n");\r
1166         iter.setIndexOnly(iter.endIndex());\r
1167         got.setLength(0);\r
1168         for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){\r
1169             if (index >= expected.length()) {\r
1170                 errln("FAIL: " +  "Unexpected character '" + (char)ch\r
1171                                + "' (" + hex(ch) + ")" + " at index " + index);\r
1172                 break;\r
1173             }\r
1174             got.append(UCharacter.toString(ch));\r
1175         }\r
1176         if (!expectedReverse.equals(got.toString())) {\r
1177                 errln("FAIL: " +  "got '" +got+ "' (" + hex(got) + ")"\r
1178                                + " but expected '" + expected\r
1179                                + "' (" + hex(expected) + ")");\r
1180         }\r
1181         if (got.length() < expected.length()) {\r
1182             errln("FAIL: " +  "Only got " + index + " chars, expected "\r
1183                       + expected.length());\r
1184         }\r
1185 \r
1186     }\r
1187     //--------------------------------------------------------------------------\r
1188     // helper class for TestPreviousNext()\r
1189     // simple UTF-32 character iterator\r
/**
 * Minimal bidirectional cursor over an int[] of code points.
 * All read methods return -1 when there is nothing to return in the
 * requested direction.
 */
class UCharIterator {

    /** Backing array; not copied. */
    private final int[] codePoints;
    /** Number of leading entries of the array that are iterated. */
    private final int limit;
    /** Current cursor position, 0..limit. */
    private int pos;

    /**
     * @param src   array holding the code points
     * @param len   number of entries of {@code src} to iterate over
     * @param index initial cursor position
     */
    public UCharIterator(int[] src, int len, int index){
        codePoints = src;
        limit = len;
        pos = index;
    }

    /** Returns the code point at the cursor without moving, or -1 at the end. */
    public int current() {
        return (pos < limit) ? codePoints[pos] : -1;
    }

    /** Returns the code point at the cursor and advances, or -1 at the end. */
    public int next() {
        if (pos >= limit) {
            return -1;
        }
        return codePoints[pos++];
    }

    /** Steps back and returns the code point there, or -1 at the start. */
    public int previous() {
        if (pos <= 0) {
            return -1;
        }
        return codePoints[--pos];
    }

    /** Returns the current cursor position. */
    public int getIndex() {
        return pos;
    }
}
1230     public void TestPreviousNext() {\r
1231         // src and expect strings\r
1232         char src[]={\r
1233             UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),\r
1234             UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),\r
1235             0xc4,\r
1236             0x1ed0\r
1237         };\r
1238         int expect[]={\r
1239             0x831d,\r
1240             0x1d158, 0x1d165,\r
1241             0x41, 0x308,\r
1242             0x4f, 0x302, 0x301\r
1243         };\r
1244 \r
1245         // expected src indexes corresponding to expect indexes\r
1246         int expectIndex[]={\r
1247             0,\r
1248             2, 2,\r
1249             4, 4,\r
1250             5, 5, 5,\r
1251             6 // behind last character\r
1252         };\r
1253 \r
1254         // initial indexes into the src and expect strings\r
1255 \r
1256         final int SRC_MIDDLE=4;\r
1257         final int EXPECT_MIDDLE=3;\r
1258 \r
1259 \r
1260         // movement vector\r
1261         // - for previous(), 0 for current(), + for next()\r
1262         // not const so that we can terminate it below for the error message\r
1263         String moves="0+0+0--0-0-+++0--+++++++0--------";\r
1264 \r
1265         // iterators\r
1266         Normalizer iter = new Normalizer(new String(src),\r
1267                                                 Normalizer.NFD,0);\r
1268         UCharIterator iter32 = new UCharIterator(expect, expect.length,\r
1269                                                      EXPECT_MIDDLE);\r
1270 \r
1271         int c1, c2;\r
1272         char m;\r
1273 \r
1274         // initially set the indexes into the middle of the strings\r
1275         iter.setIndexOnly(SRC_MIDDLE);\r
1276 \r
1277         // move around and compare the iteration code points with\r
1278         // the expected ones\r
1279         int movesIndex =0;\r
1280         while(movesIndex<moves.length()) {\r
1281             m=moves.charAt(movesIndex++);\r
1282             if(m=='-') {\r
1283                 c1=iter.previous();\r
1284                 c2=iter32.previous();\r
1285             } else if(m=='0') {\r
1286                 c1=iter.current();\r
1287                 c2=iter32.current();\r
1288             } else /* m=='+' */ {\r
1289                 c1=iter.next();\r
1290                 c2=iter32.next();\r
1291             }\r
1292 \r
1293             // compare results\r
1294             if(c1!=c2) {\r
1295                 // copy the moves until the current (m) move, and terminate\r
1296                 String history = moves.substring(0,movesIndex);\r
1297                 errln("error: mismatch in Normalizer iteration at "+history+": "\r
1298                       +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));\r
1299                 break;\r
1300             }\r
1301 \r
1302             // compare indexes\r
1303             if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {\r
1304                 // copy the moves until the current (m) move, and terminate\r
1305                 String history = moves.substring(0,movesIndex);\r
1306                 errln("error: index mismatch in Normalizer iteration at "\r
1307                       +history+ " : "+ "Normalizer index " +iter.getIndex()\r
1308                       +" expected "+ expectIndex[iter32.getIndex()]);\r
1309                 break;\r
1310             }\r
1311         }\r
1312     }\r
1313     // Only in ICU4j\r
1314     public void TestPreviousNextJCI() {\r
1315         // src and expect strings\r
1316         char src[]={\r
1317             UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),\r
1318             UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),\r
1319             0xc4,\r
1320             0x1ed0\r
1321         };\r
1322         int expect[]={\r
1323             0x831d,\r
1324             0x1d158, 0x1d165,\r
1325             0x41, 0x308,\r
1326             0x4f, 0x302, 0x301\r
1327         };\r
1328 \r
1329         // expected src indexes corresponding to expect indexes\r
1330         int expectIndex[]={\r
1331             0,\r
1332             2, 2,\r
1333             4, 4,\r
1334             5, 5, 5,\r
1335             6 // behind last character\r
1336         };\r
1337 \r
1338         // initial indexes into the src and expect strings\r
1339 \r
1340         final int SRC_MIDDLE=4;\r
1341         final int EXPECT_MIDDLE=3;\r
1342 \r
1343 \r
1344         // movement vector\r
1345         // - for previous(), 0 for current(), + for next()\r
1346         // not const so that we can terminate it below for the error message\r
1347         String moves="0+0+0--0-0-+++0--+++++++0--------";\r
1348 \r
1349         // iterators\r
1350         StringCharacterIterator text = new StringCharacterIterator(new String(src));\r
1351         Normalizer iter = new Normalizer(text,Normalizer.NFD,0);\r
1352         UCharIterator iter32 = new UCharIterator(expect, expect.length,\r
1353                                                      EXPECT_MIDDLE);\r
1354 \r
1355         int c1, c2;\r
1356         char m;\r
1357 \r
1358         // initially set the indexes into the middle of the strings\r
1359         iter.setIndexOnly(SRC_MIDDLE);\r
1360 \r
1361         // move around and compare the iteration code points with\r
1362         // the expected ones\r
1363         int movesIndex =0;\r
1364         while(movesIndex<moves.length()) {\r
1365             m=moves.charAt(movesIndex++);\r
1366             if(m=='-') {\r
1367                 c1=iter.previous();\r
1368                 c2=iter32.previous();\r
1369             } else if(m=='0') {\r
1370                 c1=iter.current();\r
1371                 c2=iter32.current();\r
1372             } else /* m=='+' */ {\r
1373                 c1=iter.next();\r
1374                 c2=iter32.next();\r
1375             }\r
1376 \r
1377             // compare results\r
1378             if(c1!=c2) {\r
1379                 // copy the moves until the current (m) move, and terminate\r
1380                 String history = moves.substring(0,movesIndex);\r
1381                 errln("error: mismatch in Normalizer iteration at "+history+": "\r
1382                       +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));\r
1383                 break;\r
1384             }\r
1385 \r
1386             // compare indexes\r
1387             if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {\r
1388                 // copy the moves until the current (m) move, and terminate\r
1389                 String history = moves.substring(0,movesIndex);\r
1390                 errln("error: index mismatch in Normalizer iteration at "\r
1391                       +history+ " : "+ "Normalizer index " +iter.getIndex()\r
1392                       +" expected "+ expectIndex[iter32.getIndex()]);\r
1393                 break;\r
1394             }\r
1395         }\r
1396     }\r
1397 \r
1398     // test APIs that are not otherwise used - improve test coverage\r
1399     public void TestNormalizerAPI() throws Exception {\r
1400         try{\r
1401             // instantiate a Normalizer from a CharacterIterator\r
1402             String s=Utility.unescape("a\u0308\uac00\\U0002f800");\r
1403             // make s a bit longer and more interesting\r
1404             UCharacterIterator iter = UCharacterIterator.getInstance(s+s);\r
1405             Normalizer norm = new Normalizer(iter, Normalizer.NFC,0);\r
1406             if(norm.next()!=0xe4) {\r
1407                 errln("error in Normalizer(CharacterIterator).next()");\r
1408             }   \r
1409     \r
1410             // test clone(), ==, and hashCode()\r
1411             Normalizer clone=(Normalizer)norm.clone();\r
1412             if(clone.equals(norm)) {\r
1413                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm");\r
1414             }\r
1415     \r
1416             \r
1417             if(clone.getLength()!= norm.getLength()){\r
1418                errln("error in Normalizer.getBeginIndex()");\r
1419             } \r
1420             // clone must have the same hashCode()\r
1421             //if(clone.hashCode()!=norm.hashCode()) {\r
1422             //    errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()");\r
1423             //}\r
1424             if(clone.next()!=0xac00) {\r
1425                 errln("error in Normalizer(Normalizer(CharacterIterator)).next()");\r
1426             }\r
1427             int ch = clone.next();\r
1428             if(ch!=0x4e3d) {\r
1429                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()");\r
1430             }\r
1431             // position changed, must change hashCode()\r
1432             if(clone.hashCode()==norm.hashCode()) {\r
1433                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()");\r
1434             }\r
1435     \r
1436             // test compose() and decompose()\r
1437             StringBuffer tel;\r
1438             String nfkc, nfkd;\r
1439             tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121");\r
1440             tel.insert(1,(char)0x0301);\r
1441     \r
1442             nfkc=Normalizer.compose(tel.toString(), true);\r
1443             nfkd=Normalizer.decompose(tel.toString(), true);\r
1444             if(\r
1445                 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))||\r
1446                 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL"))\r
1447             ) {\r
1448                 errln("error in Normalizer::(de)compose(): wrong result(s)");\r
1449             }\r
1450     \r
1451             // test setIndex()\r
1452 //            ch=norm.setIndex(3);\r
1453 //            if(ch!=0x4e3d) {\r
1454 //                errln("error in Normalizer(CharacterIterator).setIndex(3)");\r
1455 //            }\r
1456     \r
1457             // test setText(CharacterIterator) and getText()\r
1458             String out, out2;\r
1459             clone.setText(iter);\r
1460     \r
1461             out = clone.getText();\r
1462             out2 = iter.getText();\r
1463             if( !out.equals(out2) ||\r
1464                 clone.startIndex()!=0||\r
1465                 clone.endIndex()!=iter.getLength()\r
1466             ) {\r
1467                 errln("error in Normalizer::setText() or Normalizer::getText()");\r
1468             }\r
1469      \r
1470             char[] fillIn1 = new char[clone.getLength()];\r
1471             char[] fillIn2 = new char[iter.getLength()];\r
1472             int len = clone.getText(fillIn1);\r
1473             iter.getText(fillIn2,0);\r
1474             if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){\r
1475                 errln("error in Normalizer.getText(). Normalizer: "+\r
1476                                 Utility.hex(new String(fillIn1))+ \r
1477                                 " Iter: " + Utility.hex(new String(fillIn2)));\r
1478             }\r
1479             \r
1480             clone.setText(fillIn1);\r
1481             len = clone.getText(fillIn2);\r
1482             if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){\r
1483                 errln("error in Normalizer.setText() or Normalizer.getText()"+\r
1484                                 Utility.hex(new String(fillIn1))+ \r
1485                                 " Iter: " + Utility.hex(new String(fillIn2)));\r
1486             }\r
1487     \r
1488             // test setText(UChar *), getUMode() and setMode()\r
1489             clone.setText(s);\r
1490             clone.setIndexOnly(1);\r
1491             clone.setMode(Normalizer.NFD);\r
1492             if(clone.getMode()!=Normalizer.NFD) {\r
1493                 errln("error in Normalizer::setMode() or Normalizer::getMode()");\r
1494             }\r
1495             if(clone.next()!=0x308 || clone.next()!=0x1100) {\r
1496                 errln("error in Normalizer::setText() or Normalizer::setMode()");\r
1497             }\r
1498     \r
1499             // test last()/previous() with an internal buffer overflow\r
1500             StringBuffer buf = new StringBuffer("aaaaaaaaaa");\r
1501             buf.setCharAt(10-1,'\u0308');\r
1502             clone.setText(buf);\r
1503             if(clone.last()!=0x308) {\r
1504                 errln("error in Normalizer(10*U+0308).last()");\r
1505             }\r
1506     \r
1507             // test UNORM_NONE\r
1508             norm.setMode(Normalizer.NONE);\r
1509             if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) {\r
1510                 errln("error in Normalizer(UNORM_NONE).first()/next()/last()");\r
1511             }\r
1512             out=Normalizer.normalize(s, Normalizer.NONE);\r
1513             if(!out.equals(s)) {\r
1514                 errln("error in Normalizer::normalize(UNORM_NONE)");\r
1515             }\r
1516             ch = 0x1D15E;\r
1517             String exp = "\\U0001D157\\U0001D165";\r
1518             String ns = Normalizer.normalize(ch,Normalizer.NFC);\r
1519             if(!ns.equals(Utility.unescape(exp))){\r
1520                 errln("error in Normalizer.normalize(int,Mode)");\r
1521             }\r
1522             ns = Normalizer.normalize(ch,Normalizer.NFC,0);\r
1523             if(!ns.equals(Utility.unescape(exp))){\r
1524                 errln("error in Normalizer.normalize(int,Mode,int)");\r
1525             }\r
1526             \r
1527             \r
1528         }catch(Exception e){\r
1529             throw e;\r
1530         }\r
1531     }\r
1532 \r
1533     public void TestConcatenate() {\r
1534 \r
1535         Object[][]cases=new Object[][]{\r
1536             /* mode, left, right, result */\r
1537             {\r
1538                 Normalizer.NFC,\r
1539                 "re",\r
1540                 "\u0301sum\u00e9",\r
1541                 "r\u00e9sum\u00e9"\r
1542             },\r
1543             {\r
1544                 Normalizer.NFC,\r
1545                 "a\u1100",\r
1546                 "\u1161bcdefghijk",\r
1547                 "a\uac00bcdefghijk"\r
1548             },\r
1549             /* ### TODO: add more interesting cases */\r
1550             {\r
1551                 Normalizer.NFD,\r
1552                 "\u03B1\u0345",\r
1553                 "\u0C4D\uD804\uDCBA\uD834\uDD69",  // 0C4D 110BA 1D169\r
1554                 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345"  // 03B1 1D169 110BA 0C4D 0345\r
1555             }\r
1556         };\r
1557 \r
1558         String left, right, expect, result;\r
1559         Normalizer.Mode mode;\r
1560         int i;\r
1561 \r
1562         /* test concatenation */\r
1563         for(i=0; i<cases.length; ++i) {\r
1564             mode = (Normalizer.Mode)cases[i][0];\r
1565 \r
1566             left=(String)cases[i][1];\r
1567             right=(String)cases[i][2];\r
1568             expect=(String)cases[i][3];\r
1569             {\r
1570                 result=Normalizer.concatenate(left, right, mode,0);\r
1571                 if(!result.equals(expect)) {\r
1572                     errln("error in Normalizer.concatenate(), cases[] failed"\r
1573                           +", result==expect: expected: "\r
1574                           + hex(expect)+" =========> got: " + hex(result));\r
1575                 }\r
1576             }\r
1577             {\r
1578                 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0);\r
1579                 if(!result.equals(expect)) {\r
1580                     errln("error in Normalizer.concatenate(), cases[] failed"\r
1581                           +", result==expect: expected: "\r
1582                           + hex(expect)+" =========> got: " + hex(result));\r
1583                 }\r
1584             }\r
1585         }\r
1586     }\r
    // Upper bound used when scaling random numbers in TestCheckFCD: passed to
    // Random.nextInt() and used as the divisor of the scaled value.
    private final int RAND_MAX = 0x7fff;
1588 \r
1589     public void TestCheckFCD()\r
1590     {\r
1591       char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,\r
1592                      0x0008, 0x0009, 0x000A};\r
1593 \r
1594       char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301,\r
1595                       0x02B9, 0x0314, 0x0315, 0x0316};\r
1596 \r
1597       char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7,\r
1598                      0x0050, 0x0730, 0x09EE, 0x1E10};\r
1599 \r
1600       char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0},\r
1601                           {0x0061, 0x030A, 0x00E2, 0x0323, 0},\r
1602                           {0x0061, 0x0323, 0x00E2, 0x0323, 0},\r
1603                           {0x0061, 0x0323, 0x1E05, 0x0302, 0}\r
1604                         };\r
1605       Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES};\r
1606 \r
1607       char[] datachar= {        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,\r
1608                                 0x6a,\r
1609                                 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,\r
1610                                 0xea,\r
1611                                 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306,\r
1612                                 0x0307, 0x0308, 0x0309, 0x030a,\r
1613                                 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326,\r
1614                                 0x0327, 0x0328, 0x0329, 0x032a,\r
1615                                 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06,\r
1616                                 0x1e07, 0x1e08, 0x1e09, 0x1e0a\r
1617                        };\r
1618 \r
1619       int count = 0;\r
1620 \r
1621       if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES)\r
1622         errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n");\r
1623       if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO)\r
1624         errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n");\r
1625       if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES)\r
1626         errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n");\r
1627 \r
1628 \r
1629       while (count < 4)\r
1630       {\r
1631         Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0);\r
1632         if (result[count] != fcdresult) {\r
1633             errln("Normalizer.quickCheck(FCD) failed: Data set "+ count\r
1634                     + " expected value "+ result[count]);\r
1635         }\r
1636         count ++;\r
1637       }\r
1638 \r
1639       /* random checks of long strings */\r
1640       //srand((unsigned)time( NULL ));\r
1641       Random rand = createRandom(); // use test framework's random\r
1642 \r
1643       for (count = 0; count < 50; count ++)\r
1644       {\r
1645         int size = 0;\r
1646         Normalizer.QuickCheckResult testresult = Normalizer.YES;\r
1647         char[] data= new char[20];\r
1648         char[] norm= new char[100];\r
1649         char[] nfd = new char[100];\r
1650         int normStart = 0;\r
1651         int nfdsize = 0;\r
1652         while (size != 19) {\r
1653           data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX];\r
1654           logln("0x"+data[size]);\r
1655           normStart += Normalizer.normalize(data,size,size+1,\r
1656                                               norm,normStart,100,\r
1657                                               Normalizer.NFD,0);\r
1658           size ++;\r
1659         }\r
1660         logln("\n");\r
1661 \r
1662         nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0);\r
1663         //    nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL,\r
1664         //                      nfd, 100, &status);\r
1665         if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) {\r
1666           testresult = Normalizer.NO;\r
1667         }\r
1668         if (testresult == Normalizer.YES) {\r
1669           logln("result Normalizer.YES\n");\r
1670         }\r
1671         else {\r
1672           logln("result Normalizer.NO\n");\r
1673         }\r
1674 \r
1675         if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) {\r
1676           errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) );\r
1677         }\r
1678       }\r
1679     }\r
1680 \r
1681 \r
1682     // reference implementation of Normalizer::compare\r
1683     private int ref_norm_compare(String s1, String s2, int options) {\r
1684         String t1, t2,r1,r2;\r
1685 \r
1686         int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);\r
1687         \r
1688         if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) {\r
1689             // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))\r
1690             r1 = Normalizer.decompose(s1,false,normOptions);\r
1691             r2 = Normalizer.decompose(s2,false,normOptions);\r
1692             r1 = UCharacter.foldCase(r1,options);\r
1693             r2 = UCharacter.foldCase(r2,options);\r
1694         }else{\r
1695             r1 = s1;\r
1696             r2 = s2;\r
1697         }\r
1698         \r
1699         t1 = Normalizer.decompose(r1, false, normOptions);\r
1700         t2 = Normalizer.decompose(r2, false, normOptions);\r
1701 \r
1702         if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {\r
1703             UTF16.StringComparator comp \r
1704                     = new UTF16.StringComparator(true, false, \r
1705                                      UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1706             return comp.compare(t1,t2);\r
1707         } else {\r
1708             return t1.compareTo(t2);\r
1709         }\r
1710 \r
1711     }\r
1712 \r
1713     // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately\r
1714     private int norm_compare(String s1, String s2, int options) {\r
1715         int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);\r
1716 \r
1717         if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) &&\r
1718             Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) {\r
1719             options|=Normalizer.INPUT_IS_FCD;\r
1720         }\r
1721 \r
1722         return Normalizer.compare(s1, s2, options);\r
1723     }\r
1724 \r
1725     // reference implementation of UnicodeString::caseCompare\r
1726     private int ref_case_compare(String s1, String s2, int options) {\r
1727         String t1, t2;\r
1728 \r
1729         t1=s1;\r
1730         t2=s2;\r
1731 \r
1732         t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));\r
1733         t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));\r
1734 \r
1735         if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {\r
1736             UTF16.StringComparator comp \r
1737                     = new UTF16.StringComparator(true, false,\r
1738                                     UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1739             return comp.compare(t1,t2);\r
1740         } else {\r
1741             return t1.compareTo(t2);\r
1742         }\r
1743 \r
1744     }\r
1745 \r
1746     // reduce an integer to -1/0/1\r
1747     private static int sign(int value) {\r
1748         if(value==0) {\r
1749             return 0;\r
1750         } else {\r
1751             return (value>>31)|1;\r
1752         }\r
1753     }\r
1754     private static String signString(int value) {\r
1755         if(value<0) {\r
1756             return "<0";\r
1757         } else if(value==0) {\r
1758             return "=0";\r
1759         } else /* value>0 */ {\r
1760             return ">0";\r
1761         }\r
1762     }\r
1763     // test Normalizer::compare and unorm_compare (thinly wrapped by the former)\r
1764     // by comparing it with its semantic equivalent\r
1765     // since we trust the pieces, this is sufficient\r
1766 \r
1767     // test each string with itself and each other\r
1768     // each time with all options\r
    // Input strings for the compare tests. The entries are stored in escaped
    // form: TestCompare and TestCompareDebug pass each one through
    // Utility.unescape() before use, which is why supplementary code points
    // are written as "\\U........" here. The "n..m" comments give the array
    // indexes of the group that follows.
    private  String strings[]=new String[]{
                // some cases from NormalizationTest.txt
                // 0..3
                "D\u031B\u0307\u0323",
                "\u1E0C\u031B\u0307",
                "D\u031B\u0323\u0307",
                "d\u031B\u0323\u0307",
        
                // 4..6
                "\u00E4",
                "a\u0308",
                "A\u0308",
        
                // Angstrom sign = A ring
                // 7..10
                "\u212B",
                "\u00C5",
                "A\u030A",
                "a\u030A",
        
                // 11..14
                "a\u059A\u0316\u302A\u032Fb",
                "a\u302A\u0316\u032F\u059Ab",
                "a\u302A\u0316\u032F\u059Ab",
                "A\u059A\u0316\u302A\u032Fb",
        
                // from ICU case folding tests
                // 15..20
                "A\u00df\u00b5\ufb03\\U0001040c\u0131",
                "ass\u03bcffi\\U00010434i",
                "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff",
                "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff",
                "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff",
                "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd",
        
                //     U+d800 U+10001   see implementation comment in unorm_cmpEquivFold
                // vs. U+10000          at bottom - code point order
                // 21..22
                "\ud800\ud800\udc01",
                "\ud800\udc00",
        
                // other code point order tests from ustrtest.cpp
                // 23..31
                "\u20ac\ud801",
                "\u20ac\ud800\udc00",
                "\ud800",
                "\ud800\uff61",
                "\udfff",
                "\uff61\udfff",
                "\uff61\ud800\udc02",
                "\ud800\udc02",
                "\ud84d\udc56",
        
                // long strings, see cnormtst.c/TestNormCoverage()
                // equivalent if case-insensitive
                // 32..33
                "\uAD8B\uAD8B\uAD8B\uAD8B"+
                "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
                "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
                "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
                "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
                "\uAD8B\uAD8B\uAD8B\uAD8B"+
                "d\u031B\u0307\u0323",
        
                "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
                "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
                "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
                "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
                "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
                "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
                "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
                "\u1E0C\u031B\u0307",
        
                // some strings that may make a difference whether the compare function
                // case-folds or decomposes first
                // 34..41
                "\u0360\u0345\u0334",
                "\u0360\u03b9\u0334",
        
                "\u0360\u1f80\u0334",
                "\u0360\u03b1\u0313\u03b9\u0334",
        
                "\u0360\u1ffc\u0334",
                "\u0360\u03c9\u03b9\u0334",
        
                "a\u0360\u0345\u0360\u0345b",
                "a\u0345\u0360\u0345\u0360b",
        
                // interesting cases for canonical caseless match with turkic i handling
                // 42..43
                "\u00cc",
                "\u0069\u0300",
        
                // strings with post-Unicode 3.2 normalization or normalization corrections
                // 44..45
                "\u00e4\u193b\\U0002f868",
                "\u0061\u193b\u0308\u36fc",


    };
1878 \r
1879     // all combinations of options\r
1880     // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions\r
1881     final class Temp {\r
1882         int options;\r
1883         String name;\r
1884         public Temp(int opt,String str){\r
1885             options =opt;\r
1886             name = str;\r
1887         }\r
1888 \r
1889     }\r
1890     // set UNORM_UNICODE_3_2 in one additional combination\r
1891   \r
    // Option combinations exercised by the compare tests; each entry pairs the
    // option bits with the label printed in failure messages. The last entry
    // places Normalizer.UNICODE_3_2 in the normalization-options part of the
    // option word (shifted by COMPARE_NORM_OPTIONS_SHIFT).
    private Temp[] opt = new Temp[]{
                    new Temp(0,"default"),
                    new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ),
                    new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ),
                    new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ),
                    new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"),
                    new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"),
                    new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2")
            };
1901 \r
1902 \r
1903     public void TestCompareDebug(){\r
1904 \r
1905         String[] s = new String[100]; // at least as many items as in strings[] !\r
1906 \r
1907 \r
1908         int i, j, k, count=strings.length;\r
1909         int result, refResult;\r
1910 \r
1911         // create the UnicodeStrings\r
1912         for(i=0; i<count; ++i) {\r
1913             s[i]=Utility.unescape(strings[i]);\r
1914         }\r
1915         UTF16.StringComparator comp = new UTF16.StringComparator(true, false, \r
1916                                      UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1917         // test them each with each other\r
1918 \r
1919         i = 42;\r
1920         j = 43;\r
1921         k = 2;\r
1922         // test Normalizer::compare\r
1923         result=norm_compare(s[i], s[j], opt[k].options);\r
1924         refResult=ref_norm_compare(s[i], s[j], opt[k].options);\r
1925         if(sign(result)!=sign(refResult)) {\r
1926             errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
1927         }\r
1928 \r
1929         // test UnicodeString::caseCompare - same internal implementation function\r
1930          if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {\r
1931         //    result=s[i]. (s[j], opt[k].options);\r
1932             if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)\r
1933             {\r
1934                 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1935             }\r
1936             else {\r
1937                 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
1938             }\r
1939             \r
1940             result=comp.compare(s[i],s[j]);\r
1941             refResult=ref_case_compare(s[i], s[j], opt[k].options);\r
1942             if(sign(result)!=sign(refResult)) {\r
1943                       errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
1944                             }\r
1945         }\r
1946         String value1 = "\u00dater\u00fd";\r
1947         String value2 = "\u00fater\u00fd";\r
1948         if(Normalizer.compare(value1,value2,0)!=0){\r
1949             if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){\r
1950 \r
1951             }\r
1952         }\r
1953     }\r
1954 \r
1955     public void TestCompare() {\r
1956 \r
1957         String[] s = new String[100]; // at least as many items as in strings[] !\r
1958 \r
1959         int i, j, k, count=strings.length;\r
1960         int result, refResult;\r
1961 \r
1962         // create the UnicodeStrings\r
1963         for(i=0; i<count; ++i) {\r
1964             s[i]=Utility.unescape(strings[i]);\r
1965         }\r
1966         UTF16.StringComparator comp = new UTF16.StringComparator();\r
1967         // test them each with each other\r
1968         for(i=0; i<count; ++i) {\r
1969             for(j=i; j<count; ++j) {\r
1970                 for(k=0; k<opt.length; ++k) {\r
1971                     // test Normalizer::compare\r
1972                     result=norm_compare(s[i], s[j], opt[k].options);\r
1973                     refResult=ref_norm_compare(s[i], s[j], opt[k].options);\r
1974                     if(sign(result)!=sign(refResult)) {\r
1975                         errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
1976                     }\r
1977 \r
1978                     // test UnicodeString::caseCompare - same internal implementation function\r
1979                      if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {\r
1980                         //    result=s[i]. (s[j], opt[k].options);\r
1981                         if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)\r
1982                         {\r
1983                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1984                         }\r
1985                         else {\r
1986                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
1987                         }\r
1988                         \r
1989                         comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);\r
1990                         // result=comp.caseCompare(s[i],s[j], opt[k].options);\r
1991                         result=comp.compare(s[i],s[j]);\r
1992                         refResult=ref_case_compare(s[i], s[j], opt[k].options);\r
1993                         if(sign(result)!=sign(refResult)) {\r
1994                                   errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
1995                                          }\r
1996                     }\r
1997                 }\r
1998             }\r
1999         }\r
2000         \r
2001         // test cases with i and I to make sure Turkic works\r
2002         char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };\r
2003         UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();\r
2004         Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;\r
2005         nfcImpl.ensureCanonIterData();\r
2006 \r
2007         String s1, s2;\r
2008         int start, end;\r
2009     \r
2010         // collect all sets into one for contiguous output\r
2011         for(i=0; i<iI.length; ++i) {\r
2012             if(nfcImpl.getCanonStartSet(iI[i], iSet)) {\r
2013                 set.addAll(iSet);\r
2014             }\r
2015         }\r
2016 \r
2017         // test all of these precomposed characters\r
2018         UnicodeSetIterator it = new UnicodeSetIterator(set);\r
2019         while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {\r
2020             start=it.codepoint;\r
2021             end=it.codepointEnd;\r
2022             while(start<=end) {\r
2023                 s1 = Integer.toString(start);\r
2024                 s2 = Normalizer.decompose(s1, false, 0);\r
2025 //                if(U_FAILURE(errorCode)) {\r
2026 //                    errln("Normalizer::decompose(U+%04x) failed: %s", start, u_errorName(errorCode));\r
2027 //                    return;\r
2028 //                }\r
2029                 for(k=0; k<opt.length; ++k) {\r
2030                     // test Normalizer::compare\r
2031 \r
2032                     result= norm_compare(s1, s2, opt[k].options);\r
2033                     refResult=ref_norm_compare(s1, s2, opt[k].options);\r
2034                     if(sign(result)!=sign(refResult)) {\r
2035                         errln("Normalizer.compare(U+"+hex(start)+" with its NFD, "+opt[k].name+")" \r
2036                               + signString(result)+" should be "+signString(refResult));\r
2037                     }\r
2038     \r
2039                     // test UnicodeString::caseCompare - same internal implementation function\r
2040                     if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) {\r
2041                          if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)\r
2042                         {\r
2043                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
2044                         }\r
2045                         else {\r
2046                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
2047                         }\r
2048                         \r
2049                         comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);\r
2050          \r
2051                         result=comp.compare(s1,s2);\r
2052                         refResult=ref_case_compare(s1, s2, opt[k].options);\r
2053                         if(sign(result)!=sign(refResult)) {\r
2054                             errln("UTF16.compare(U+"+hex(start)+" with its NFD, "\r
2055                                   +opt[k].name+")"+signString(result) +" should be "+signString(refResult));\r
2056                         }\r
2057                     }\r
2058                 }\r
2059     \r
2060                 ++start;\r
2061             }\r
2062         }\r
2063 \r
2064     }\r
2065 \r
2066     // verify that case-folding does not un-FCD strings\r
2067     int countFoldFCDExceptions(int foldingOptions) {\r
2068         String s, d;\r
2069         int c;\r
2070         int count;\r
2071         int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC;\r
2072         Normalizer.QuickCheckResult qcResult;\r
2073         int category;\r
2074         boolean isNFD;\r
2075 \r
2076 \r
2077         logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions));\r
2078 \r
2079         count=0;\r
2080         for(c=0; c<=0x10ffff; ++c) {\r
2081             category=UCharacter.getType(c);\r
2082             if(category==UCharacterCategory.UNASSIGNED) {\r
2083                 continue; // skip unassigned code points\r
2084             }\r
2085             if(c==0xac00) {\r
2086                 c=0xd7a3; // skip Hangul - no case folding there\r
2087                 continue;\r
2088             }\r
2089             // skip Han blocks - no case folding there either\r
2090             if(c==0x3400) {\r
2091                 c=0x4db5;\r
2092                 continue;\r
2093             }\r
2094             if(c==0x4e00) {\r
2095                 c=0x9fa5;\r
2096                 continue;\r
2097             }\r
2098             if(c==0x20000) {\r
2099                 c=0x2a6d6;\r
2100                 continue;\r
2101             }\r
2102 \r
2103             s= UTF16.valueOf(c);\r
2104 \r
2105             // get leading and trailing cc for c\r
2106             d= Normalizer.decompose(s,false);\r
2107             isNFD= s==d;\r
2108             cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));\r
2109             trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));\r
2110 \r
2111             // get leading and trailing cc for the case-folding of c\r
2112             UCharacter.foldCase(s,(foldingOptions==0));\r
2113             d = Normalizer.decompose(s, false);\r
2114             foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));\r
2115             foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));\r
2116 \r
2117             qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);\r
2118 \r
2119 \r
2120             // bad:\r
2121             // - character maps to empty string: adjacent characters may then need reordering\r
2122             // - folding has different leading/trailing cc's, and they don't become just 0\r
2123             // - folding itself is not FCD\r
2124             if( qcResult!=Normalizer.YES ||\r
2125                 s.length()==0 ||\r
2126                 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)\r
2127             ) {\r
2128                 ++count;\r
2129                 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");\r
2130                 //errln("  cc %02x trailCC %02x    foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x   quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult);\r
2131                 continue;\r
2132             }\r
2133 \r
2134             // also bad:\r
2135             // if a code point is in NFD but its case folding is not, then\r
2136             // unorm_compare will also fail\r
2137             if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {\r
2138                 ++count;\r
2139                 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");\r
2140             }\r
2141         }\r
2142 \r
2143         logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" );\r
2144         return count;\r
2145     }\r
2146 \r
2147     public void TestFindFoldFCDExceptions() {\r
2148         int count;\r
2149 \r
2150         count=countFoldFCDExceptions(0);\r
2151         count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
2152         if(count>0) {\r
2153             //*\r
2154             //* If case-folding un-FCDs any strings, then unorm_compare() must be\r
2155             //* re-implemented.\r
2156             //* It currently assumes that one can check for FCD then case-fold\r
2157             //* and then still have FCD strings for raw decomposition without reordering.\r
2158             //*\r
2159             errln("error: There are "+count+" code points for which case-folding"+\r
2160                   " may un-FCD a string for all folding options.\n See comment"+\r
2161                   " in BasicNormalizerTest::FindFoldFCDExceptions()!");\r
2162         }\r
2163     }\r
2164     \r
2165     public void TestCombiningMarks(){\r
2166         String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";\r
2167         String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";\r
2168         String result = Normalizer.decompose(src,false);\r
2169         if(!expected.equals(result)){\r
2170             errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result));\r
2171         }\r
2172     }\r
2173 \r
2174     /*\r
2175      * Re-enable this test when UTC fixes UAX 21\r
2176     public void TestUAX21Failure(){\r
2177         final String[][] cases = new String[][]{\r
2178                 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"},\r
2179                 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"},\r
2180                 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},\r
2181                 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},\r
2182                 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"},\r
2183                 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"},\r
2184         };\r
2185         for(int i = 0; i< cases.length; i++){\r
2186             String s1 =cases[0][0]; \r
2187             String s2 = cases[0][1];\r
2188             if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare\r
2189                 &&\r
2190                 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){\r
2191                 errln("Normalizer.compare() failed for s1: " \r
2192                         + Utility.hex(s1) +" s2: " + Utility.hex(s2));\r
2193             }\r
2194         }\r
2195     }\r
2196     */\r
2197     public void TestFCNFKCClosure() {\r
2198         final class TestStruct{\r
2199             int c;\r
2200             String s;\r
2201             TestStruct(int cp, String src){\r
2202                 c=cp;\r
2203                 s=src;\r
2204             }\r
2205         }\r
2206         \r
2207         TestStruct[] tests= new TestStruct[]{\r
2208             new TestStruct( 0x00C4, "" ),\r
2209             new TestStruct( 0x00E4, "" ),\r
2210             new TestStruct( 0x037A, "\u0020\u03B9" ),\r
2211             new TestStruct( 0x03D2, "\u03C5" ),\r
2212             new TestStruct( 0x20A8, "\u0072\u0073" ) ,\r
2213             new TestStruct( 0x210B, "\u0068" ),\r
2214             new TestStruct( 0x210C, "\u0068" ),\r
2215             new TestStruct( 0x2121, "\u0074\u0065\u006C" ),\r
2216             new TestStruct( 0x2122, "\u0074\u006D" ),\r
2217             new TestStruct( 0x2128, "\u007A" ),\r
2218             new TestStruct( 0x1D5DB,"\u0068" ),\r
2219             new TestStruct( 0x1D5ED,"\u007A" ),\r
2220             new TestStruct( 0x0061, "" )\r
2221         };\r
2222     \r
2223 \r
2224         for(int i = 0; i < tests.length; ++ i) {\r
2225             String result=Normalizer.getFC_NFKC_Closure(tests[i].c);\r
2226             if(!result.equals(new String(tests[i].s))) {\r
2227                 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong");\r
2228             }\r
2229         }\r
2230     \r
2231         /* error handling */\r
2232 \r
2233         int length=Normalizer.getFC_NFKC_Closure(0x5c, null);\r
2234         if(length!=0){\r
2235             errln("getFC_NFKC_Closure did not perform error handling correctly");\r
2236         }\r
2237     }\r
2238     public void TestBugJ2324(){\r
2239        /* String[] input = new String[]{\r
2240                             //"\u30FD\u3099",\r
2241                             "\u30FA\u309A",\r
2242                             "\u30FB\u309A",\r
2243                             "\u30FC\u309A",\r
2244                             "\u30FE\u309A",\r
2245                             "\u30FD\u309A",\r
2246 \r
2247         };*/\r
2248         String troublesome = "\u309A";\r
2249         for(int i=0x3000; i<0x3100;i++){\r
2250             String input = ((char)i)+troublesome;\r
2251             try{                            \r
2252               /*  String result =*/ Normalizer.compose(input,false);\r
2253             }catch(IndexOutOfBoundsException e){\r
2254                 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString());\r
2255             }\r
2256         }\r
2257                 \r
2258     }\r
2259 \r
// Integer constants named after normalization forms. initSkippables() below uses
// D, C and KD as indices into its skipSets array; KC, FCD and NONE are presumably
// the matching values for the remaining modes - confirm at their use sites.
2260      static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5;   \r
2261     private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets){\r
2262         if( skipSets.length < 4 ){\r
2263             return null;\r
2264         }\r
2265         skipSets[D].applyPattern(\r
2266             "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"\r
2267             + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD"\r
2268             + "\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137"\r
2269             + "\\u0139-\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165"\r
2270             + "\\u0168-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC"\r
2271             + "\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B"\r
2272             + "\\u021E\\u021F\\u0226-\\u0233\\u0300-\\u034E\\u0350-\\u036F"\r
2273             + "\\u0374\\u037E\\u0385-\\u038A\\u038C\\u038E-\\u0390\\u03AA-"\r
2274             + "\\u03B0\\u03CA-\\u03CE\\u03D3\\u03D4\\u0400\\u0401\\u0403\\u0407"\r
2275             + "\\u040C-\\u040E\\u0419\\u0439\\u0450\\u0451\\u0453\\u0457\\u045C"\r
2276             + "-\\u045E\\u0476\\u0477\\u0483-\\u0487\\u04C1\\u04C2\\u04D0-"\r
2277             + "\\u04D3\\u04D6\\u04D7\\u04DA-\\u04DF\\u04E2-\\u04E7\\u04EA-"\r
2278             + "\\u04F5\\u04F8\\u04F9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4"\r
2279             + "\\u05C5\\u05C7\\u0610-\\u061A\\u0622-\\u0626\\u064B-\\u065E"\r
2280             + "\\u0670\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4"\r
2281             + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"\r
2282             + "\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"\r
2283             + "\\u082D\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958"\r
2284             + "-\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33"\r
2285             + "\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C"\r
2286             + "\\u0B48\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD"\r
2287             + "\\u0C48\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA"\r
2288             + "\\u0CCB\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE"\r
2289             + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"\r
2290             + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"\r
2291             + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"\r
2292             + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"\r
2293             + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u135F\\u1714\\u1734"\r
2294             + "\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75"\r
2295             + "-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34"\r
2296             + "\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA"\r
2297             + "\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8\\u1CED"\r
2298             + "\\u1DC0-\\u1DE6\\u1DFD-\\u1E99\\u1E9B\\u1EA0-\\u1EF9\\u1F00-"\r
2299             + "\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-"\r
2300             + "\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4"\r
2301             + "\\u1FB6-\\u1FBC\\u1FBE\\u1FC1-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-"\r
2302             + "\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFD\\u2000"\r
2303             + "\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"\r
2304             + "\\u212B\\u219A\\u219B\\u21AE\\u21CD-\\u21CF\\u2204\\u2209\\u220C"\r
2305             + "\\u2224\\u2226\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"\r
2306             + "\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"\r
2307             + "\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"\r
2308             + "\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-\\u2DFF\\u302A-"\r
2309             + "\\u302F\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"\r
2310             + "\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"\r
2311             + "\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"\r
2312             + "\\u3099\\u309A\\u309E\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4\\u30B6"\r
2313             + "\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7\\u30C9"\r
2314             + "\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA\\u30DC"\r
2315             + "\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\uA66F\\uA67C\\uA67D\\uA6F0"\r
2316             + "\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D\\uA953"\r
2317             + "\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF"\r
2318             + "\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12"\r
2319             + "\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D"\r
2320             + "\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36"\r
2321             + "\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-"\r
2322             + "\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"\r
2323             + "38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"\r
2324             + "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"\r
2325             + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"\r
2326             + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"\r
2327             + "F800-\\U0002FA1D]", false);\r
2328 \r
2329       skipSets[C].applyPattern(\r
2330           "[^<->A-PR-Za-pr-z\\u00A8\\u00C0-\\u00CF\\u00D1-\\u00D6\\u00D8-"\r
2331           + "\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD\\u00FF-"\r
2332           + "\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121\\u0124"\r
2333           + "\\u0125\\u0128-\\u012D\\u0130\\u0139\\u013A\\u013D\\u013E\\u0143"\r
2334           + "\\u0144\\u0147\\u0148\\u014C-\\u0151\\u0154\\u0155\\u0158-"\r
2335           + "\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168-\\u0171\\u0174-"\r
2336           + "\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7\\u01CD-\\u01DC\\u01DE"\r
2337           + "-\\u01E1\\u01E6-\\u01EB\\u01F4\\u01F5\\u01F8-\\u01FB\\u0200-"\r
2338           + "\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0292\\u0300-\\u034E"\r
2339           + "\\u0350-\\u036F\\u0374\\u037E\\u0387\\u0391\\u0395\\u0397\\u0399"\r
2340           + "\\u039F\\u03A1\\u03A5\\u03A9\\u03AC\\u03AE\\u03B1\\u03B5\\u03B7"\r
2341           + "\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-\\u03CB\\u03CE\\u03D2\\u0406"\r
2342           + "\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423\\u0427\\u042B"\r
2343           + "\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E\\u0443\\u0447"\r
2344           + "\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487\\u04D8\\u04D9"\r
2345           + "\\u04E8\\u04E9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5"\r
2346           + "\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627\\u0648\\u064A-"\r
2347           + "\\u065E\\u0670\\u06C1\\u06D2\\u06D5-\\u06DC\\u06DF-\\u06E4"\r
2348           + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"\r
2349           + "\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"\r
2350           + "\\u082D\\u0928\\u0930\\u0933\\u093C\\u094D\\u0951-\\u0954\\u0958"\r
2351           + "-\\u095F\\u09BC\\u09BE\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF"\r
2352           + "\\u0A33\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD"\r
2353           + "\\u0B3C\\u0B3E\\u0B47\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92"\r
2354           + "\\u0BBE\\u0BC6\\u0BC7\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56"\r
2355           + "\\u0CBC\\u0CBF\\u0CC2\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E"\r
2356           + "\\u0D46\\u0D47\\u0D4D\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF"\r
2357           + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"\r
2358           + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"\r
2359           + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"\r
2360           + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"\r
2361           + "\\u0FC6\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u1100-\\u1112"\r
2362           + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"\r
2363           + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"\r
2364           + "\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"\r
2365           + "\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"\r
2366           + "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"\r
2367           + "\\u1CED\\u1DC0-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F\\u1E12-"\r
2368           + "\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-"\r
2369           + "\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E99\\u1EA0-"\r
2370           + "\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-"\r
2371           + "\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51"\r
2372           + "\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-"\r
2373           + "\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99"\r
2374           + "\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB\\u1FBC\\u1FBE"\r
2375           + "\\u1FBF\\u1FC3\\u1FC6\\u1FC9\\u1FCB\\u1FCC\\u1FD3\\u1FDB\\u1FE3"\r
2376           + "\\u1FEB\\u1FEE\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000"\r
2377           + "\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"\r
2378           + "\\u212B\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"\r
2379           + "\\u220B\\u2223\\u2225\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261"\r
2380           + "\\u2264\\u2265\\u2272\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282"\r
2381           + "\\u2283\\u2286\\u2287\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB"\r
2382           + "\\u22B2-\\u22B5\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-"\r
2383           + "\\u2DFF\\u302A-\\u302F\\u3046\\u304B\\u304D\\u304F\\u3051\\u3053"\r
2384           + "\\u3055\\u3057\\u3059\\u305B\\u305D\\u305F\\u3061\\u3064\\u3066"\r
2385           + "\\u3068\\u306F\\u3072\\u3075\\u3078\\u307B\\u3099\\u309A\\u309D"\r
2386           + "\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1\\u30B3\\u30B5\\u30B7\\u30B9"\r
2387           + "\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4\\u30C6\\u30C8\\u30CF\\u30D2"\r
2388           + "\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2\\u30FD\\uA66F\\uA67C\\uA67D"\r
2389           + "\\uA6F0\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D"\r
2390           + "\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE"\r
2391           + "\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C"\r
2392           + "\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88"\r
2393           + "\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84"\r
2394           + "\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80"\r
2395           + "\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C"\r
2396           + "\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178"\r
2397           + "\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274"\r
2398           + "\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370"\r
2399           + "\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C"\r
2400           + "\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568"\r
2401           + "\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664"\r
2402           + "\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760"\r
2403           + "\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C"\r
2404           + "\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958"\r
2405           + "\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54"\r
2406           + "\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50"\r
2407           + "\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C"\r
2408           + "\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48"\r
2409           + "\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44"\r
2410           + "\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40"\r
2411           + "\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C"\r
2412           + "\\uC058\\uC074\\uC090\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138"\r
2413           + "\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234"\r
2414           + "\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330"\r
2415           + "\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C"\r
2416           + "\\uC448\\uC464\\uC480\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528"\r
2417           + "\\uC544\\uC560\\uC57C\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624"\r
2418           + "\\uC640\\uC65C\\uC678\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720"\r
2419           + "\\uC73C\\uC758\\uC774\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C"\r
2420           + "\\uC838\\uC854\\uC870\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918"\r
2421           + "\\uC934\\uC950\\uC96C\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14"\r
2422           + "\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10"\r
2423           + "\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C"\r
2424           + "\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08"\r
2425           + "\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04"\r
2426           + "\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00"\r
2427           + "\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC"\r
2428           + "\\uD018\\uD034\\uD050\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8"\r
2429           + "\\uD114\\uD130\\uD14C\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4"\r
2430           + "\\uD210\\uD22C\\uD248\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0"\r
2431           + "\\uD30C\\uD328\\uD344\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC"\r
2432           + "\\uD408\\uD424\\uD440\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8"\r
2433           + "\\uD504\\uD520\\uD53C\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4"\r
2434           + "\\uD600\\uD61C\\uD638\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0"\r
2435           + "\\uD6FC\\uD718\\uD734\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10"\r
2436           + "\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-"\r
2437           + "\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-"\r
2438           + "\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"\r
2439           + "-\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010"\r
2440           + "A38-\\U00010A3A\\U00010A3F\\U00011099\\U0001109B\\U000110A5"\r
2441           + "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"\r
2442           + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"\r
2443           + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"\r
2444           + "F800-\\U0002FA1D]", false);\r
2445    \r
2446         skipSets[KD].applyPattern(\r
2447             "[^\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5\\u00B8-\\u00BA"\r
2448             + "\\u00BC-\\u00BE\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6"\r
2449             + "\\u00D9-\\u00DD\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6"\r
2450             + "\\u00F9-\\u00FD\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130"\r
2451             + "\\u0132-\\u0137\\u0139-\\u0140\\u0143-\\u0149\\u014C-\\u0151"\r
2452             + "\\u0154-\\u0165\\u0168-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0"\r
2453             + "\\u01C4-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B"\r
2454             + "\\u021E\\u021F\\u0226-\\u0233\\u02B0-\\u02B8\\u02D8-\\u02DD"\r
2455             + "\\u02E0-\\u02E4\\u0300-\\u034E\\u0350-\\u036F\\u0374\\u037A"\r
2456             + "\\u037E\\u0384-\\u038A\\u038C\\u038E-\\u0390\\u03AA-\\u03B0"\r
2457             + "\\u03CA-\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"\r
2458             + "\\u03F9\\u0400\\u0401\\u0403\\u0407\\u040C-\\u040E\\u0419\\u0439"\r
2459             + "\\u0450\\u0451\\u0453\\u0457\\u045C-\\u045E\\u0476\\u0477\\u0483"\r
2460             + "-\\u0487\\u04C1\\u04C2\\u04D0-\\u04D3\\u04D6\\u04D7\\u04DA-"\r
2461             + "\\u04DF\\u04E2-\\u04E7\\u04EA-\\u04F5\\u04F8\\u04F9\\u0587"\r
2462             + "\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610"\r
2463             + "-\\u061A\\u0622-\\u0626\\u064B-\\u065E\\u0670\\u0675-\\u0678"\r
2464             + "\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7"\r
2465             + "\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-\\u07F3"\r
2466             + "\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D"\r
2467             + "\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958-"\r
2468             + "\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36"\r
2469             + "\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B48"\r
2470             + "\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD\\u0C48"\r
2471             + "\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA\\u0CCB"\r
2472             + "\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE\\u0E33"\r
2473             + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-"\r
2474             + "\\u0ECB\\u0EDC\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39"\r
2475             + "\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80"\r
2476             + "-\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"\r
2477             + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u10FC\\u135F\\u1714"\r
2478             + "\\u1734\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60"\r
2479             + "\\u1A75-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12"\r
2480             + "\\u1B34\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73"\r
2481             + "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"\r
2482             + "\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"\r
2483             + "\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E9B\\u1EA0-\\u1EF9"\r
2484             + "\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"\r
2485             + "\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-"\r
2486             + "\\u1FB4\\u1FB6-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-"\r
2487             + "\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFE\\u2000-\\u200A\\u2011"\r
2488             + "\\u2017\\u2024-\\u2026\\u202F\\u2033\\u2034\\u2036\\u2037\\u203C"\r
2489             + "\\u203E\\u2047-\\u2049\\u2057\\u205F\\u2070\\u2071\\u2074-"\r
2490             + "\\u208E\\u2090-\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1\\u20E5-"\r
2491             + "\\u20F0\\u2100-\\u2103\\u2105-\\u2107\\u2109-\\u2113\\u2115"\r
2492             + "\\u2116\\u2119-\\u211D\\u2120-\\u2122\\u2124\\u2126\\u2128"\r
2493             + "\\u212A-\\u212D\\u212F-\\u2131\\u2133-\\u2139\\u213B-\\u2140"\r
2494             + "\\u2145-\\u2149\\u2150-\\u217F\\u2189\\u219A\\u219B\\u21AE"\r
2495             + "\\u21CD-\\u21CF\\u2204\\u2209\\u220C\\u2224\\u2226\\u222C\\u222D"\r
2496             + "\\u222F\\u2230\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"\r
2497             + "\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"\r
2498             + "\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"\r
2499             + "\\u2329\\u232A\\u2460-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC"\r
2500             + "\\u2C7C\\u2C7D\\u2CEF-\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F"\r
2501             + "\\u2EF3\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F\\u3036\\u3038-"\r
2502             + "\\u303A\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"\r
2503             + "\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"\r
2504             + "\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"\r
2505             + "\\u3099-\\u309C\\u309E\\u309F\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4"\r
2506             + "\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7"\r
2507             + "\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA"\r
2508             + "\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\u30FF\\u3131-"\r
2509             + "\\u318E\\u3192-\\u319F\\u3200-\\u321E\\u3220-\\u3247\\u3250-"\r
2510             + "\\u327E\\u3280-\\u32FE\\u3300-\\u33FF\\uA66F\\uA67C\\uA67D"\r
2511             + "\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-"\r
2512             + "\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8"\r
2513             + "\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D"\r
2514             + "\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A"\r
2515             + "-\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-"\r
2516             + "\\uFB17\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41"\r
2517             + "\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F"\r
2518             + "\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-\\uFE19\\uFE20-\\uFE26"\r
2519             + "\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B"\r
2520             + "\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC\\uFF01-\\uFFBE\\uFFC2-"\r
2521             + "\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC\\uFFE0-"\r
2522             + "\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"\r
2523             + "38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"\r
2524             + "\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"\r
2525             + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"\r
2526             + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0001"\r
2527             + "D400-\\U0001D454\\U0001D456-\\U0001D49C\\U0001D49E\\U0001D49F"\r
2528             + "\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4A9-\\U0001D4AC\\U0001D"\r
2529             + "4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-\\U0001D4C3\\U0001D4C5-"\r
2530             + "\\U0001D505\\U0001D507-\\U0001D50A\\U0001D50D-\\U0001D514\\U0001"\r
2531             + "D516-\\U0001D51C\\U0001D51E-\\U0001D539\\U0001D53B-\\U0001D53E"\r
2532             + "\\U0001D540-\\U0001D544\\U0001D546\\U0001D54A-\\U0001D550\\U0001"\r
2533             + "D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF"\r
2534             + "\\U0001F100-\\U0001F10A\\U0001F110-\\U0001F12E\\U0001F131\\U0001"\r
2535             + "F13D\\U0001F13F\\U0001F142\\U0001F146\\U0001F14A-\\U0001F14E"\r
2536             + "\\U0001F190\\U0001F200\\U0001F210-\\U0001F231\\U0001F240-\\U0001"\r
2537             + "F248\\U0002F800-\\U0002FA1D]", false);\r
2538    \r
2539         skipSets[KC].applyPattern(\r
2540             "[^<->A-PR-Za-pr-z\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5"\r
2541             + "\\u00B8-\\u00BA\\u00BC-\\u00BE\\u00C0-\\u00CF\\u00D1-\\u00D6"\r
2542             + "\\u00D8-\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD"\r
2543             + "\\u00FF-\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121"\r
2544             + "\\u0124\\u0125\\u0128-\\u012D\\u0130\\u0132\\u0133\\u0139\\u013A"\r
2545             + "\\u013D-\\u0140\\u0143\\u0144\\u0147-\\u0149\\u014C-\\u0151"\r
2546             + "\\u0154\\u0155\\u0158-\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168"\r
2547             + "-\\u0171\\u0174-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7"\r
2548             + "\\u01C4-\\u01DC\\u01DE-\\u01E1\\u01E6-\\u01EB\\u01F1-\\u01F5"\r
2549             + "\\u01F8-\\u01FB\\u0200-\\u021B\\u021E\\u021F\\u0226-\\u0233"\r
2550             + "\\u0292\\u02B0-\\u02B8\\u02D8-\\u02DD\\u02E0-\\u02E4\\u0300-"\r
2551             + "\\u034E\\u0350-\\u036F\\u0374\\u037A\\u037E\\u0384\\u0385\\u0387"\r
2552             + "\\u0391\\u0395\\u0397\\u0399\\u039F\\u03A1\\u03A5\\u03A9\\u03AC"\r
2553             + "\\u03AE\\u03B1\\u03B5\\u03B7\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-"\r
2554             + "\\u03CB\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"\r
2555             + "\\u03F9\\u0406\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423"\r
2556             + "\\u0427\\u042B\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E"\r
2557             + "\\u0443\\u0447\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487"\r
2558             + "\\u04D8\\u04D9\\u04E8\\u04E9\\u0587\\u0591-\\u05BD\\u05BF\\u05C1"\r
2559             + "\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627"\r
2560             + "\\u0648\\u064A-\\u065E\\u0670\\u0675-\\u0678\\u06C1\\u06D2"\r
2561             + "\\u06D5-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED"\r
2562             + "\\u0711\\u0730-\\u074A\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-"\r
2563             + "\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0928\\u0930\\u0933"\r
2564             + "\\u093C\\u094D\\u0951-\\u0954\\u0958-\\u095F\\u09BC\\u09BE"\r
2565             + "\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36\\u0A3C"\r
2566             + "\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B3E\\u0B47"\r
2567             + "\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92\\u0BBE\\u0BC6\\u0BC7"\r
2568             + "\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CBF\\u0CC2"\r
2569             + "\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E\\u0D46\\u0D47\\u0D4D"\r
2570             + "\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF\\u0E33\\u0E38-\\u0E3A"\r
2571             + "\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-\\u0ECB\\u0EDC"\r
2572             + "\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D"\r
2573             + "\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80-\\u0F84"\r
2574             + "\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9\\u0FC6"\r
2575             + "\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u10FC\\u1100-\\u1112"\r
2576             + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"\r
2577             + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"\r
2578             + "\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"\r
2579             + "\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"\r
2580             + "\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"\r
2581             + "\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"\r
2582             + "\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F"\r
2583             + "\\u1E12-\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53"\r
2584             + "\\u1E58-\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E9B"\r
2585             + "\\u1EA0-\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19"\r
2586             + "\\u1F20-\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50"\r
2587             + "\\u1F51\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79"\r
2588             + "\\u1F7B-\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98"\r
2589             + "\\u1F99\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB-\\u1FC1"\r
2590             + "\\u1FC3\\u1FC6\\u1FC9\\u1FCB-\\u1FCF\\u1FD3\\u1FDB\\u1FDD-"\r
2591             + "\\u1FDF\\u1FE3\\u1FEB\\u1FED-\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB"\r
2592             + "-\\u1FFE\\u2000-\\u200A\\u2011\\u2017\\u2024-\\u2026\\u202F"\r
2593             + "\\u2033\\u2034\\u2036\\u2037\\u203C\\u203E\\u2047-\\u2049\\u2057"\r
2594             + "\\u205F\\u2070\\u2071\\u2074-\\u208E\\u2090-\\u2094\\u20A8"\r
2595             + "\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2100-\\u2103\\u2105-"\r
2596             + "\\u2107\\u2109-\\u2113\\u2115\\u2116\\u2119-\\u211D\\u2120-"\r
2597             + "\\u2122\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2131"\r
2598             + "\\u2133-\\u2139\\u213B-\\u2140\\u2145-\\u2149\\u2150-\\u217F"\r
2599             + "\\u2189\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"\r
2600             + "\\u220B\\u2223\\u2225\\u222C\\u222D\\u222F\\u2230\\u223C\\u2243"\r
2601             + "\\u2245\\u2248\\u224D\\u2261\\u2264\\u2265\\u2272\\u2273\\u2276"\r
2602             + "\\u2277\\u227A-\\u227D\\u2282\\u2283\\u2286\\u2287\\u2291\\u2292"\r
2603             + "\\u22A2\\u22A8\\u22A9\\u22AB\\u22B2-\\u22B5\\u2329\\u232A\\u2460"\r
2604             + "-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2CEF-"\r
2605             + "\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F\\u2EF3\\u2F00-\\u2FD5"\r
2606             + "\\u3000\\u302A-\\u302F\\u3036\\u3038-\\u303A\\u3046\\u304B"\r
2607             + "\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059\\u305B\\u305D"\r
2608             + "\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072\\u3075\\u3078"\r
2609             + "\\u307B\\u3099-\\u309D\\u309F\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1"\r
2610             + "\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4"\r
2611             + "\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2"\r
2612             + "\\u30FD\\u30FF\\u3131-\\u318E\\u3192-\\u319F\\u3200-\\u321E"\r
2613             + "\\u3220-\\u3247\\u3250-\\u327E\\u3280-\\u32FE\\u3300-\\u33FF"\r
2614             + "\\uA66F\\uA67C\\uA67D\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-"\r
2615             + "\\uA8F1\\uA92B-\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-"\r
2616             + "\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C"\r
2617             + "\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18"\r
2618             + "\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14"\r
2619             + "\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10"\r
2620             + "\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C"\r
2621             + "\\uB028\\uB044\\uB060\\uB07C\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108"\r
2622             + "\\uB124\\uB140\\uB15C\\uB178\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204"\r
2623             + "\\uB220\\uB23C\\uB258\\uB274\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300"\r
2624             + "\\uB31C\\uB338\\uB354\\uB370\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC"\r
2625             + "\\uB418\\uB434\\uB450\\uB46C\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8"\r
2626             + "\\uB514\\uB530\\uB54C\\uB568\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4"\r
2627             + "\\uB610\\uB62C\\uB648\\uB664\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0"\r
2628             + "\\uB70C\\uB728\\uB744\\uB760\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC"\r
2629             + "\\uB808\\uB824\\uB840\\uB85C\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8"\r
2630             + "\\uB904\\uB920\\uB93C\\uB958\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4"\r
2631             + "\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0"\r
2632             + "\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC"\r
2633             + "\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8"\r
2634             + "\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4"\r
2635             + "\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0"\r
2636             + "\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC"\r
2637             + "\\uBFE8\\uC004\\uC020\\uC03C\\uC058\\uC074\\uC090\\uC0AC\\uC0C8"\r
2638             + "\\uC0E4\\uC100\\uC11C\\uC138\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4"\r
2639             + "\\uC1E0\\uC1FC\\uC218\\uC234\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0"\r
2640             + "\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC"\r
2641             + "\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448\\uC464\\uC480\\uC49C\\uC4B8"\r
2642             + "\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544\\uC560\\uC57C\\uC598\\uC5B4"\r
2643             + "\\uC5D0\\uC5EC\\uC608\\uC624\\uC640\\uC65C\\uC678\\uC694\\uC6B0"\r
2644             + "\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C\\uC758\\uC774\\uC790\\uC7AC"\r
2645             + "\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838\\uC854\\uC870\\uC88C\\uC8A8"\r
2646             + "\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934\\uC950\\uC96C\\uC988\\uC9A4"\r
2647             + "\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0"\r
2648             + "\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C"\r
2649             + "\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98"\r
2650             + "\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94"\r
2651             + "\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90"\r
2652             + "\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C"\r
2653             + "\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018\\uD034\\uD050\\uD06C\\uD088"\r
2654             + "\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114\\uD130\\uD14C\\uD168\\uD184"\r
2655             + "\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210\\uD22C\\uD248\\uD264\\uD280"\r
2656             + "\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C\\uD328\\uD344\\uD360\\uD37C"\r
2657             + "\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408\\uD424\\uD440\\uD45C\\uD478"\r
2658             + "\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504\\uD520\\uD53C\\uD558\\uD574"\r
2659             + "\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600\\uD61C\\uD638\\uD654\\uD670"\r
2660             + "\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC\\uD718\\uD734\\uD750\\uD76C"\r
2661             + "\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"\r
2662             + "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6D\\uFA70-"\r
2663             + "\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB36\\uFB38-"\r
2664             + "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3"\r
2665             + "-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-"\r
2666             + "\\uFE19\\uFE20-\\uFE26\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-"\r
2667             + "\\uFE66\\uFE68-\\uFE6B\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC"\r
2668             + "\\uFF01-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7"\r
2669             + "\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010"\r
2670             + "A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010A3F\\U00011099"\r
2671             + "\\U0001109B\\U000110A5\\U000110B9\\U000110BA\\U0001D15E-\\U0001D"\r
2672             + "169\\U0001D16D-\\U0001D172\\U0001D17B-\\U0001D182\\U0001D185-"\r
2673             + "\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001"\r
2674             + "D242-\\U0001D244\\U0001D400-\\U0001D454\\U0001D456-\\U0001D49C"\r
2675             + "\\U0001D49E\\U0001D49F\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4"\r
2676             + "A9-\\U0001D4AC\\U0001D4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-"\r
2677             + "\\U0001D4C3\\U0001D4C5-\\U0001D505\\U0001D507-\\U0001D50A\\U0001"\r
2678             + "D50D-\\U0001D514\\U0001D516-\\U0001D51C\\U0001D51E-\\U0001D539"\r
2679             + "\\U0001D53B-\\U0001D53E\\U0001D540-\\U0001D544\\U0001D546\\U0001"\r
2680             + "D54A-\\U0001D550\\U0001D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB"\r
2681             + "\\U0001D7CE-\\U0001D7FF\\U0001F100-\\U0001F10A\\U0001F110-"\r
2682             + "\\U0001F12E\\U0001F131\\U0001F13D\\U0001F13F\\U0001F142\\U0001F1"\r
2683             + "46\\U0001F14A-\\U0001F14E\\U0001F190\\U0001F200\\U0001F210-"\r
2684             + "\\U0001F231\\U0001F240-\\U0001F248\\U0002F800-\\U0002FA1D]", false);\r
2685    \r
2686         return skipSets;\r
2687     }\r
2688 \r
2689     public void TestSkippable() {\r
2690         UnicodeSet[] skipSets = new UnicodeSet[] {\r
2691             new UnicodeSet(), //NFD\r
2692             new UnicodeSet(), //NFC\r
2693             new UnicodeSet(), //NFKD\r
2694             new UnicodeSet()  //NFKC\r
2695         };\r
2696         UnicodeSet[] expectSets = new UnicodeSet[] {\r
2697             new UnicodeSet(),\r
2698             new UnicodeSet(),\r
2699             new UnicodeSet(),\r
2700             new UnicodeSet()\r
2701         };\r
2702         StringBuilder s, pattern;\r
2703 \r
2704         // build NF*Skippable sets from runtime data \r
2705         skipSets[D].applyPattern("[:NFD_Inert:]");\r
2706         skipSets[C].applyPattern("[:NFC_Inert:]");\r
2707         skipSets[KD].applyPattern("[:NFKD_Inert:]");\r
2708         skipSets[KC].applyPattern("[:NFKC_Inert:]");\r
2709 \r
2710         expectSets = initSkippables(expectSets);\r
2711         if(expectSets[D].contains(0x0350)){\r
2712             errln("expectSets[D] contains 0x0350");\r
2713         }\r
2714         for(int i=0; i<expectSets.length; ++i) {\r
2715             if(!skipSets[i].equals(expectSets[i])) {\r
2716                 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"+\r
2717                       "May need to update hardcoded UnicodeSet patterns in com.ibm.icu.dev.test.normalizer.BasicTest.java\n"+\r
2718                       "See ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n" +\r
2719                       "Run com.ibm.text.UCD.Main with the option NFSkippable.");\r
2720 \r
2721                 s=new StringBuilder();\r
2722 \r
2723                 s.append("\n\nskip=       ");\r
2724                 s.append(skipSets[i].toPattern(true));\r
2725                 s.append("\n\n");\r
2726 \r
2727                 s.append("skip-expect=");             \r
2728                 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));\r
2729                 s.append(pattern);\r
2730 \r
2731                 pattern.delete(0,pattern.length());\r
2732                 s.append("\n\nexpect-skip=");\r
2733                 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));\r
2734                 s.append(pattern);\r
2735                 s.append("\n\n");\r
2736 \r
2737                 pattern.delete(0,pattern.length());\r
2738                 s.append("\n\nintersection(expect,skip)=");\r
2739                 UnicodeSet intersection  = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);\r
2740                 pattern = new StringBuilder(intersection.toPattern(true));\r
2741                 s.append(pattern);\r
2742                 s.append("\n\n");\r
2743 \r
2744                 errln(s.toString());\r
2745             }\r
2746         }\r
2747     }\r
2748 \r
2749     public void TestBugJ2068(){\r
2750         String sample = "The quick brown fox jumped over the lazy dog";\r
2751         UCharacterIterator text = UCharacterIterator.getInstance(sample);\r
2752         Normalizer norm = new Normalizer(text,Normalizer.NFC,0);\r
2753         text.setIndex(4);\r
2754         if(text.current() == norm.current()){\r
2755             errln("Normalizer is not cloning the UCharacterIterator");\r
2756         }\r
2757      }   \r
2758      public void TestGetCombiningClass(){\r
2759         for(int i=0;i<0x10FFFF;i++){\r
2760             int cc = UCharacter.getCombiningClass(i);\r
2761             if(0xD800<= i && i<=0xDFFF && cc >0 ){\r
2762                 cc = UCharacter.getCombiningClass(i);\r
2763                 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8));\r
2764             } \r
2765         }\r
2766     }  \r
2767 \r
2768     public void TestSerializedSet(){\r
2769         USerializedSet sset=new USerializedSet();\r
2770         UnicodeSet set = new UnicodeSet();\r
2771         int start, end;\r
2772 \r
2773         char[] serialized = {\r
2774             0x8007,  // length\r
2775             3,  // bmpLength\r
2776             0xc0, 0xfe, 0xfffc,\r
2777             1, 9, 0x10, 0xfffc\r
2778         };\r
2779         sset.getSet(serialized, 0);\r
2780 \r
2781         // collect all sets into one for contiguous output\r
2782         int[] startEnd = new int[2];\r
2783         int count=sset.countRanges();\r
2784         for(int j=0; j<count; ++j) {\r
2785             sset.getRange(j, startEnd);\r
2786             set.add(startEnd[0], startEnd[1]);\r
2787         }\r
2788 \r
2789         // test all of these characters\r
2790         UnicodeSetIterator it = new UnicodeSetIterator(set);\r
2791         while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {\r
2792             start=it.codepoint;\r
2793             end=it.codepointEnd;\r
2794             while(start<=end) {\r
2795                 if(!sset.contains(start)){\r
2796                     errln("USerializedSet.contains failed for "+Utility.hex(start,8));\r
2797                 }\r
2798                 ++start;\r
2799             }\r
2800         }\r
2801     }\r
2802 \r
2803     public void TestReturnFailure(){\r
2804         char[] term = {'r','\u00e9','s','u','m','\u00e9' };\r
2805         char[] decomposed_term = new char[10 + term.length + 2];\r
2806         int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0);\r
2807         int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); \r
2808         if(rc!=rc1){\r
2809             errln("Normalizer decompose did not return correct length");\r
2810         }\r
2811     }\r
2812 \r
2813     private final static class TestCompositionCase {\r
2814         public Normalizer.Mode mode;\r
2815         public int options;\r
2816         public String input, expect;\r
2817         TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) {\r
2818             this.mode=mode;\r
2819             this.options=options;\r
2820             this.input=input;\r
2821             this.expect=expect;\r
2822         }\r
2823     }\r
2824 \r
2825     public void TestComposition() {\r
2826         final TestCompositionCase cases[]=new TestCompositionCase[]{\r
2827             /*\r
2828              * special cases for UAX #15 bug\r
2829              * see Unicode Corrigendum #5: Normalization Idempotency\r
2830              * at http://unicode.org/versions/corrigendum5.html\r
2831              * (was Public Review Issue #29)\r
2832              */\r
2833             new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327",      "\u1100\u0300\u1161\u0327"),\r
2834             new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),\r
2835             new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8",      "\uac00\u0327\u0300\u11a8"),\r
2836             new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e",            "\u0b47\u0300\u0b3e"),\r
2837 \r
2838             /* TODO: add test cases for UNORM_FCC here (j2151) */\r
2839         };\r
2840 \r
2841         String output;\r
2842         int i;\r
2843 \r
2844         for(i=0; i<cases.length; ++i) {\r
2845             output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options);\r
2846             if(!output.equals(cases[i].expect)) {\r
2847                 errln("unexpected result for case "+i);\r
2848             }\r
2849         }\r
2850     }\r
2851 \r
2852     public void TestCustomComp() {\r
2853         String [][] pairs={\r
2854             { "\\uD801\\uE000\\uDFFE", "" },\r
2855             { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },\r
2856             { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },\r
2857             { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },\r
2858             { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },\r
2859             { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },\r
2860             { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },\r
2861             { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }\r
2862         };\r
2863         Normalizer2 customNorm2;\r
2864         customNorm2=\r
2865             Normalizer2.getInstance(\r
2866                 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),\r
2867                 "testnorm",\r
2868                 Normalizer2.Mode.COMPOSE);\r
2869         for(int i=0; i<pairs.length; ++i) {\r
2870             String[] pair=pairs[i];\r
2871             String input=Utility.unescape(pair[0]);\r
2872             String expected=Utility.unescape(pair[1]);\r
2873             String result=customNorm2.normalize(input);\r
2874             if(!result.equals(expected)) {\r
2875                 errln("custom compose Normalizer2 did not normalize input "+i+" as expected");\r
2876             }\r
2877         }\r
2878     }\r
2879 \r
2880     public void TestCustomFCC() {\r
2881         String[][] pairs={\r
2882             { "\\uD801\\uE000\\uDFFE", "" },\r
2883             { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },\r
2884             { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },\r
2885             // The following expected result is different from CustomComp\r
2886             // because of only-contiguous composition.\r
2887             { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },\r
2888             { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },\r
2889             { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },\r
2890             { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },\r
2891             { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }\r
2892         };\r
2893         Normalizer2 customNorm2;\r
2894         customNorm2=\r
2895             Normalizer2.getInstance(\r
2896                 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),\r
2897                 "testnorm",\r
2898                 Normalizer2.Mode.COMPOSE_CONTIGUOUS);\r
2899         for(int i=0; i<pairs.length; ++i) {\r
2900             String[] pair=pairs[i];\r
2901             String input=Utility.unescape(pair[0]);\r
2902             String expected=Utility.unescape(pair[1]);\r
2903             String result=customNorm2.normalize(input);\r
2904             if(!result.equals(expected)) {\r
2905                 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected");\r
2906             }\r
2907         }\r
2908     }\r
2909 \r
2910     public void TestCanonIterData() {\r
2911         // For now, just a regression test.\r
2912         Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData();\r
2913         // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character\r
2914         // in some decomposition mappings where there is a composition exclusion.\r
2915         // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0)\r
2916         // but it is not a segment starter because it occurs in a decomposition mapping.\r
2917         if(impl.isCanonSegmentStarter(0xfb5)) {\r
2918             errln("isCanonSegmentStarter(U+0fb5)=true is wrong");\r
2919         }\r
2920         // For [:Segment_Starter:] to work right, not just the property function has to work right,\r
2921         // UnicodeSet also needs a correct range starts set.\r
2922         UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze();\r
2923         if(segStarters.contains(0xfb5)) {\r
2924             errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong");\r
2925         }\r
2926         // Try characters up to Kana and miscellaneous CJK but below Han (for expediency).\r
2927         for(int c=0; c<=0x33ff; ++c) {\r
2928             boolean isStarter=impl.isCanonSegmentStarter(c);\r
2929             boolean isContained=segStarters.contains(c);\r
2930             if(isStarter!=isContained) {\r
2931                 errln(String.format(\r
2932                         "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " +\r
2933                         "[:Segment_Starter:].contains(same)",\r
2934                         c, isStarter));\r
2935             }\r
2936         }\r
2937     }\r
2938 }\r