]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
icu4jsrc
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / dev / test / normalizer / BasicTest.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 1996-2008, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 \r
8 package com.ibm.icu.dev.test.normalizer;\r
9 \r
10 import java.text.StringCharacterIterator;\r
11 import java.util.Random;\r
12 \r
13 import com.ibm.icu.dev.test.TestFmwk;\r
14 import com.ibm.icu.impl.NormalizerImpl;\r
15 import com.ibm.icu.impl.USerializedSet;\r
16 import com.ibm.icu.impl.Utility;\r
17 import com.ibm.icu.lang.*;\r
18 import com.ibm.icu.lang.UCharacter;\r
19 import com.ibm.icu.lang.UCharacterCategory;\r
20 import com.ibm.icu.text.Normalizer;\r
21 import com.ibm.icu.text.UCharacterIterator;\r
22 import com.ibm.icu.text.UTF16;\r
23 import com.ibm.icu.text.UnicodeSet;\r
24 import com.ibm.icu.text.UnicodeSetIterator;\r
25 \r
26 \r
27 public class BasicTest extends TestFmwk {\r
28     public static void main(String[] args) throws Exception {\r
29         new BasicTest().run(args);\r
30     }\r
31 \r
32     String[][] canonTests = {\r
33         // Input                Decomposed              Composed\r
34         { "cat",                "cat",                  "cat"               },\r
35         { "\u00e0ardvark",      "a\u0300ardvark",       "\u00e0ardvark",    },\r
36 \r
37         { "\u1e0a",             "D\u0307",              "\u1e0a"            }, // D-dot_above\r
38         { "D\u0307",            "D\u0307",              "\u1e0a"            }, // D dot_above\r
39 \r
40         { "\u1e0c\u0307",       "D\u0323\u0307",        "\u1e0c\u0307"      }, // D-dot_below dot_above\r
41         { "\u1e0a\u0323",       "D\u0323\u0307",        "\u1e0c\u0307"      }, // D-dot_above dot_below\r
42         { "D\u0307\u0323",      "D\u0323\u0307",        "\u1e0c\u0307"      }, // D dot_below dot_above\r
43 \r
44         { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307",  "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above\r
45         { "D\u0307\u0328\u0323","D\u0328\u0323\u0307",  "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below\r
46 \r
47         { "\u1E14",             "E\u0304\u0300",        "\u1E14"            }, // E-macron-grave\r
48         { "\u0112\u0300",       "E\u0304\u0300",        "\u1E14"            }, // E-macron + grave\r
49         { "\u00c8\u0304",       "E\u0300\u0304",        "\u00c8\u0304"      }, // E-grave + macron\r
50 \r
51         { "\u212b",             "A\u030a",              "\u00c5"            }, // angstrom_sign\r
52         { "\u00c5",             "A\u030a",              "\u00c5"            }, // A-ring\r
53 \r
54         { "\u00c4ffin",         "A\u0308ffin",          "\u00c4ffin"        },\r
55         { "\u00c4\uFB03n",      "A\u0308\uFB03n",       "\u00c4\uFB03n"     },\r
56 \r
57         { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        }, //updated with 3.0\r
58         { "\u00fd\uFB03n",      "y\u0301\uFB03n",       "\u00fd\uFB03n"     }, //updated with 3.0\r
59 \r
60         { "Henry IV",           "Henry IV",             "Henry IV"          },\r
61         { "Henry \u2163",       "Henry \u2163",         "Henry \u2163"      },\r
62 \r
63         { "\u30AC",             "\u30AB\u3099",         "\u30AC"            }, // ga (Katakana)\r
64         { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            }, // ka + ten\r
65         { "\uFF76\uFF9E",       "\uFF76\uFF9E",         "\uFF76\uFF9E"      }, // hw_ka + hw_ten\r
66         { "\u30AB\uFF9E",       "\u30AB\uFF9E",         "\u30AB\uFF9E"      }, // ka + hw_ten\r
67         { "\uFF76\u3099",       "\uFF76\u3099",         "\uFF76\u3099"      }, // hw_ka + ten\r
68 \r
69         { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },\r
70         {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},\r
71     };\r
72 \r
73     String[][] compatTests = {\r
74             // Input                Decomposed              Composed\r
75         { "cat",                 "cat",                     "cat"           },\r
76         { "\uFB4f",             "\u05D0\u05DC",         "\u05D0\u05DC",     }, // Alef-Lamed vs. Alef, Lamed\r
77 \r
78         { "\u00C4ffin",         "A\u0308ffin",          "\u00C4ffin"        },\r
79         { "\u00C4\uFB03n",      "A\u0308ffin",          "\u00C4ffin"        }, // ffi ligature -> f + f + i\r
80 \r
81         { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },        //updated for 3.0\r
82         { "\u00fd\uFB03n",      "y\u0301ffin",          "\u00fdffin"        }, // ffi ligature -> f + f + i\r
83 \r
84         { "Henry IV",           "Henry IV",             "Henry IV"          },\r
85         { "Henry \u2163",       "Henry IV",             "Henry IV"          },\r
86 \r
87         { "\u30AC",             "\u30AB\u3099",         "\u30AC"            }, // ga (Katakana)\r
88         { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            }, // ka + ten\r
89 \r
90         { "\uFF76\u3099",       "\u30AB\u3099",         "\u30AC"            }, // hw_ka + ten\r
91 \r
92         /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/\r
93         { "\uFF76\uFF9E",       "\u30AB\u3099",         "\u30AC"            }, // hw_ka + hw_ten\r
94         { "\u30AB\uFF9E",       "\u30AB\u3099",         "\u30AC"            }, // ka + hw_ten\r
95 \r
96     };\r
97 \r
98     // With Canonical decomposition, Hangul syllables should get decomposed\r
99     // into Jamo, but Jamo characters should not be decomposed into\r
100     // conjoining Jamo\r
101     String[][] hangulCanon = {\r
102         // Input                Decomposed              Composed\r
103         { "\ud4db",             "\u1111\u1171\u11b6",   "\ud4db"        },\r
104         { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6",   "\ud4db"        },\r
105     };\r
106 \r
107     // With compatibility decomposition turned on,\r
108     // it should go all the way down to conjoining Jamo characters.\r
109     // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE\r
110     String[][] hangulCompat = {\r
111         // Input        Decomposed                          Composed\r
112         // { "\ud4db",     "\u1111\u116e\u1175\u11af\u11c2",   "\ud478\u1175\u11af\u11c2"  },\r
113     };\r
114 \r
115     public void TestHangulCompose()\r
116                 throws Exception{\r
117         // Make sure that the static composition methods work\r
118         logln("Canonical composition...");\r
119         staticTest(Normalizer.NFC, hangulCanon,  2);\r
120         logln("Compatibility composition...");\r
121         staticTest(Normalizer.NFKC, hangulCompat, 2);\r
122         // Now try iterative composition....\r
123         logln("Iterative composition...");\r
124         Normalizer norm = new Normalizer("", Normalizer.NFC,0);\r
125         iterateTest(norm, hangulCanon, 2);\r
126 \r
127         norm.setMode(Normalizer.NFKD);\r
128         iterateTest(norm, hangulCompat, 2);\r
129 \r
130         // And finally, make sure you can do it in reverse too\r
131         logln("Reverse iteration...");\r
132         norm.setMode(Normalizer.NFC);\r
133         backAndForth(norm, hangulCanon);\r
134      }\r
135 \r
136     public void TestHangulDecomp() throws Exception{\r
137         // Make sure that the static decomposition methods work\r
138         logln("Canonical decomposition...");\r
139         staticTest(Normalizer.NFD, hangulCanon,  1);\r
140         logln("Compatibility decomposition...");\r
141         staticTest(Normalizer.NFKD, hangulCompat, 1);\r
142 \r
143          // Now the iterative decomposition methods...\r
144         logln("Iterative decomposition...");\r
145         Normalizer norm = new Normalizer("", Normalizer.NFD,0);\r
146         iterateTest(norm, hangulCanon, 1);\r
147 \r
148         norm.setMode(Normalizer.NFKD);\r
149         iterateTest(norm, hangulCompat, 1);\r
150 \r
151         // And finally, make sure you can do it in reverse too\r
152         logln("Reverse iteration...");\r
153         norm.setMode(Normalizer.NFD);\r
154         backAndForth(norm, hangulCanon);\r
155     }\r
156     public void TestNone() throws Exception{\r
157         Normalizer norm = new Normalizer("", Normalizer.NONE,0);\r
158         iterateTest(norm, canonTests, 0);\r
159         staticTest(Normalizer.NONE, canonTests, 0);\r
160     }\r
161     public void TestDecomp() throws Exception{\r
162         Normalizer norm = new Normalizer("", Normalizer.NFD,0);\r
163         iterateTest(norm, canonTests, 1);\r
164         staticTest(Normalizer.NFD, canonTests, 1);\r
165         decomposeTest(Normalizer.NFD, canonTests, 1);\r
166     }\r
167 \r
168     public void TestCompatDecomp() throws Exception{\r
169         Normalizer norm = new Normalizer("", Normalizer.NFKD,0);\r
170         iterateTest(norm, compatTests, 1);\r
171         staticTest(Normalizer.NFKD,compatTests, 1);\r
172         decomposeTest(Normalizer.NFKD,compatTests, 1);\r
173     }\r
174 \r
175     public void TestCanonCompose() throws Exception{\r
176         Normalizer norm = new Normalizer("", Normalizer.NFC,0);\r
177         iterateTest(norm, canonTests, 2);\r
178         staticTest(Normalizer.NFC, canonTests, 2);\r
179         composeTest(Normalizer.NFC, canonTests, 2);\r
180     }\r
181 \r
182     public void TestCompatCompose() throws Exception{\r
183         Normalizer norm = new Normalizer("", Normalizer.NFKC,0);\r
184         iterateTest(norm, compatTests, 2);\r
185         staticTest(Normalizer.NFKC,compatTests, 2);\r
186         composeTest(Normalizer.NFKC,compatTests, 2);\r
187     }\r
188 \r
189     public void TestExplodingBase() throws Exception{\r
190         // \u017f - Latin small letter long s\r
191         // \u0307 - combining dot above\r
192         // \u1e61 - Latin small letter s with dot above\r
193         // \u1e9b - Latin small letter long s with dot above\r
194         String[][] canon = {\r
195             // Input                Decomposed              Composed\r
196             { "Tschu\u017f",        "Tschu\u017f",          "Tschu\u017f"    },\r
197             { "Tschu\u1e9b",        "Tschu\u017f\u0307",    "Tschu\u1e9b"    },\r
198         };\r
199         String[][] compat = {\r
200             // Input                Decomposed              Composed\r
201             { "\u017f",        "s",              "s"           },\r
202             { "\u1e9b",        "s\u0307",        "\u1e61"      },\r
203         };\r
204 \r
205         staticTest(Normalizer.NFD, canon,  1);\r
206         staticTest(Normalizer.NFC, canon,  2);\r
207 \r
208         staticTest(Normalizer.NFKD, compat, 1);\r
209         staticTest(Normalizer.NFKC, compat, 2);\r
210 \r
211     }\r
212 \r
    /**
     * The Tibetan vowel sign AA, 0f71, was messed up prior to
     * Unicode version 2.1.9.
     * This test was to stay disabled until 2.1.9 or 3.0 was released;
     * it is enabled now.
     */
218     public void TestTibetan() throws Exception{\r
219         String[][] decomp = {\r
220             { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }\r
221         };\r
222         String[][] compose = {\r
223             { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }\r
224         };\r
225 \r
226         staticTest(Normalizer.NFD, decomp, 1);\r
227         staticTest(Normalizer.NFKD,decomp, 2);\r
228         staticTest(Normalizer.NFC, compose, 1);\r
229         staticTest(Normalizer.NFKC,compose, 2);\r
230     }\r
231 \r
232     /**\r
233      * Make sure characters in the CompositionExclusion.txt list do not get\r
234      * composed to.\r
235      */\r
236     public void TestCompositionExclusion()\r
237                 throws Exception{\r
238         // This list is generated from CompositionExclusion.txt.\r
239         // Update whenever the normalizer tables are updated.  Note\r
240         // that we test all characters listed, even those that can be\r
241         // derived from the Unicode DB and are therefore commented\r
242         // out.\r
243         String EXCLUDED =\r
244             "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +\r
245             "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +\r
246             "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +\r
247             "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +\r
248             "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +\r
249             "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +\r
250             "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" +\r
251             "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +\r
252             "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +\r
253             "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +\r
254             "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +\r
255             "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +\r
256             "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +\r
257             "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E";\r
258         for (int i=0; i<EXCLUDED.length(); ++i) {\r
259             String a = String.valueOf(EXCLUDED.charAt(i));\r
260             String b = Normalizer.normalize(a, Normalizer.NFKD);\r
261             String c = Normalizer.normalize(b, Normalizer.NFC);\r
262             if (c.equals(a)) {\r
263                 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +\r
264                       hex(b) + " x COMPOSE => " +\r
265                       hex(c));\r
266             } else if (isVerbose()) {\r
267                 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +\r
268                       hex(b) + " x COMPOSE => " +\r
269                       hex(c));\r
270             }\r
271         }\r
272         // The following method works too, but it is somewhat\r
273         // incestuous.  It uses UInfo, which is the same database that\r
274         // NormalizerBuilder uses, so if something is wrong with\r
275         // UInfo, the following test won't show it.  All it will show\r
276         // is that NormalizerBuilder has been run with whatever the\r
277         // current UInfo is.\r
278         //\r
279         // We comment this out in favor of the test above, which\r
280         // provides independent verification (but also requires\r
281         // independent updating).\r
282 //      logln("---");\r
283 //      UInfo uinfo = new UInfo();\r
284 //      for (int i=0; i<=0xFFFF; ++i) {\r
285 //          if (!uinfo.isExcludedComposition((char)i) ||\r
286 //              (!uinfo.hasCanonicalDecomposition((char)i) &&\r
287 //               !uinfo.hasCompatibilityDecomposition((char)i))) continue;\r
288 //          String a = String.valueOf((char)i);\r
289 //          String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0);\r
290 //          String c = Normalizer.normalize(b,Normalizer.COMPOSE,0);\r
291 //          if (c.equals(a)) {\r
292 //              errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +\r
293 //                    hex(b) + " x COMPOSE => " +\r
294 //                    hex(c));\r
295 //          } else if (isVerbose()) {\r
296 //              logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +\r
297 //                    hex(b) + " x COMPOSE => " +\r
298 //                    hex(c));\r
299 //          }\r
300 //      }\r
301     }\r
302 \r
303     /**\r
304      * Test for a problem that showed up just before ICU 1.6 release\r
305      * having to do with combining characters with an index of zero.\r
306      * Such characters do not participate in any canonical\r
307      * decompositions.  However, having an index of zero means that\r
308      * they all share one typeMask[] entry, that is, they all have to\r
309      * map to the same canonical class, which is not the case, in\r
310      * reality.\r
311      */\r
312     public void TestZeroIndex()\r
313                 throws Exception{\r
314         String[] DATA = {\r
315             // Expect col1 x COMPOSE_COMPAT => col2\r
316             // Expect col2 x DECOMP => col3\r
317             "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",\r
318             "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",\r
319             "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",\r
320             "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",\r
321             "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",\r
322         };\r
323 \r
324         for (int i=0; i<DATA.length; i+=3) {\r
325             String a = DATA[i];\r
326             String b = Normalizer.normalize(a, Normalizer.NFKC);\r
327             String exp = DATA[i+1];\r
328             if (b.equals(exp)) {\r
329                 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));\r
330             } else {\r
331                 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +\r
332                       ", expect " + hex(exp));\r
333             }\r
334             a = Normalizer.normalize(b, Normalizer.NFD);\r
335             exp = DATA[i+2];\r
336             if (a.equals(exp)) {\r
337                 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a));\r
338             } else {\r
339                 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) +\r
340                       ", expect " + hex(exp));\r
341             }\r
342         }\r
343     }\r
344 \r
345     /**\r
346      * Test for a problem found by Verisign.  Problem is that\r
347      * characters at the start of a string are not put in canonical\r
348      * order correctly by compose() if there is no starter.\r
349      */\r
350     public void TestVerisign()\r
351                 throws Exception{\r
352         String[] inputs = {\r
353             "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",\r
354             "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"\r
355         };\r
356         String[] outputs = {\r
357             "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",\r
358             "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"\r
359         };\r
360 \r
361         for (int i = 0; i < inputs.length; ++i) {\r
362             String input = inputs[i];\r
363             String output = outputs[i];\r
364             String result = Normalizer.decompose(input, false);\r
365             if (!result.equals(output)) {\r
366                 errln("FAIL input: " + hex(input));\r
367                 errln(" decompose: " + hex(result));\r
368                 errln("  expected: " + hex(output));\r
369             }\r
370             result = Normalizer.compose(input, false);\r
371             if (!result.equals(output)) {\r
372                 errln("FAIL input: " + hex(input));\r
373                 errln("   compose: " + hex(result));\r
374                 errln("  expected: " + hex(output));\r
375             }\r
376         }\r
377 \r
378     }\r
379     public void  TestQuickCheckResultNO()\r
380                  throws Exception{\r
381         final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C,\r
382                                 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E};\r
383         final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB,\r
384                                 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E};\r
385         final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE,\r
386                                 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};\r
387         final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE,\r
388                                 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};\r
389 \r
390 \r
391         final int SIZE = 10;\r
392 \r
393         int count = 0;\r
394         for (; count < SIZE; count ++)\r
395         {\r
396             if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),\r
397                     Normalizer.NFD,0) != Normalizer.NO)\r
398             {\r
399                 errln("ERROR in NFD quick check at U+" +\r
400                        Integer.toHexString(CPNFD[count]));\r
401                 return;\r
402             }\r
403             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),\r
404                         Normalizer.NFC,0) !=Normalizer.NO)\r
405             {\r
406                 errln("ERROR in NFC quick check at U+"+\r
407                        Integer.toHexString(CPNFC[count]));\r
408                 return;\r
409             }\r
410             if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),\r
411                                 Normalizer.NFKD,0) != Normalizer.NO)\r
412             {\r
413                 errln("ERROR in NFKD quick check at U+"+\r
414                        Integer.toHexString(CPNFKD[count]));\r
415                 return;\r
416             }\r
417             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
418                                          Normalizer.NFKC,0) !=Normalizer.NO)\r
419             {\r
420                 errln("ERROR in NFKC quick check at U+"+\r
421                        Integer.toHexString(CPNFKC[count]));\r
422                 return;\r
423             }\r
424             // for improving coverage\r
425             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
426                                          Normalizer.NFKC) !=Normalizer.NO)\r
427             {\r
428                 errln("ERROR in NFKC quick check at U+"+\r
429                        Integer.toHexString(CPNFKC[count]));\r
430                 return;\r
431             }\r
432         }\r
433     }\r
434 \r
435 \r
436     public void TestQuickCheckResultYES()\r
437                 throws Exception{\r
438         final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A,\r
439                                 0x2261, 0x3075, 0x4000, 0x5000, 0xF000};\r
440         final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500,\r
441                                 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000};\r
442         final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB,\r
443                                 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27};\r
444         final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000,\r
445                                 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E};\r
446 \r
447         final int SIZE = 10;\r
448         int count = 0;\r
449 \r
450         char cp = 0;\r
451         while (cp < 0xA0)\r
452         {\r
453             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0)\r
454                                             != Normalizer.YES)\r
455             {\r
456                 errln("ERROR in NFD quick check at U+"+\r
457                                                       Integer.toHexString(cp));\r
458                 return;\r
459             }\r
460             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0)\r
461                                              != Normalizer.YES)\r
462             {\r
463                 errln("ERROR in NFC quick check at U+"+\r
464                                                       Integer.toHexString(cp));\r
465                 return;\r
466             }\r
467             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0)\r
468                                              != Normalizer.YES)\r
469             {\r
470                 errln("ERROR in NFKD quick check at U+" +\r
471                                                       Integer.toHexString(cp));\r
472                 return;\r
473             }\r
474             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0)\r
475                                              != Normalizer.YES)\r
476             {\r
477                 errln("ERROR in NFKC quick check at U+"+\r
478                                                        Integer.toHexString(cp));\r
479                 return;\r
480             }\r
481             // improve the coverage\r
482             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC)\r
483                                              != Normalizer.YES)\r
484             {\r
485                 errln("ERROR in NFKC quick check at U+"+\r
486                                                        Integer.toHexString(cp));\r
487                 return;\r
488             }\r
489             cp++;\r
490         }\r
491 \r
492         for (; count < SIZE; count ++)\r
493         {\r
494             if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),\r
495                                          Normalizer.NFD,0)!=Normalizer.YES)\r
496             {\r
497                 errln("ERROR in NFD quick check at U+"+\r
498                                              Integer.toHexString(CPNFD[count]));\r
499                 return;\r
500             }\r
501             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),\r
502                                          Normalizer.NFC,0)!=Normalizer.YES)\r
503             {\r
504                 errln("ERROR in NFC quick check at U+"+\r
505                                              Integer.toHexString(CPNFC[count]));\r
506                 return;\r
507             }\r
508             if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),\r
509                                          Normalizer.NFKD,0)!=Normalizer.YES)\r
510             {\r
511                 errln("ERROR in NFKD quick check at U+"+\r
512                                     Integer.toHexString(CPNFKD[count]));\r
513                 return;\r
514             }\r
515             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
516                                          Normalizer.NFKC,0)!=Normalizer.YES)\r
517             {\r
518                 errln("ERROR in NFKC quick check at U+"+\r
519                         Integer.toHexString(CPNFKC[count]));\r
520                 return;\r
521             }\r
522             // improve the coverage\r
523             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
524                                          Normalizer.NFKC)!=Normalizer.YES)\r
525             {\r
526                 errln("ERROR in NFKC quick check at U+"+\r
527                         Integer.toHexString(CPNFKC[count]));\r
528                 return;\r
529             }\r
530         }\r
531     }\r
532     public void TestBengali() throws Exception{\r
533         String input = "\u09bc\u09be\u09cd\u09be";\r
534         String output=Normalizer.normalize(input,Normalizer.NFC);\r
535         if(!input.equals(output)){\r
536              errln("ERROR in NFC of string");\r
537         }\r
538     }\r
539     public void TestQuickCheckResultMAYBE()\r
540                 throws Exception{\r
541 \r
542         final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161,\r
543                                 0x116A, 0x1173, 0x1175, 0x3099, 0x309A};\r
544         final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E,\r
545                                 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099};\r
546 \r
547 \r
548         final int SIZE = 10;\r
549 \r
550         int count = 0;\r
551 \r
552         /* NFD and NFKD does not have any MAYBE codepoints */\r
553         for (; count < SIZE; count ++)\r
554         {\r
555             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),\r
556                                         Normalizer.NFC,0)!=Normalizer.MAYBE)\r
557             {\r
558                 errln("ERROR in NFC quick check at U+"+\r
559                                             Integer.toHexString(CPNFC[count]));\r
560                 return;\r
561             }\r
562             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),\r
563                                        Normalizer.NFKC,0)!=Normalizer.MAYBE)\r
564             {\r
565                 errln("ERROR in NFKC quick check at U+"+\r
566                                             Integer.toHexString(CPNFKC[count]));\r
567                 return;\r
568             }\r
569             if (Normalizer.quickCheck(new char[]{CPNFC[count]},\r
570                                         Normalizer.NFC,0)!=Normalizer.MAYBE)\r
571             {\r
572                 errln("ERROR in NFC quick check at U+"+\r
573                                             Integer.toHexString(CPNFC[count]));\r
574                 return;\r
575             }\r
576             if (Normalizer.quickCheck(new char[]{CPNFKC[count]},\r
577                                        Normalizer.NFKC,0)!=Normalizer.MAYBE)\r
578             {\r
579                 errln("ERROR in NFKC quick check at U+"+\r
580                                             Integer.toHexString(CPNFKC[count]));\r
581                 return;\r
582             }\r
583             if (Normalizer.quickCheck(new char[]{CPNFKC[count]},\r
584                                        Normalizer.NONE,0)!=Normalizer.MAYBE)\r
585             {\r
586                 errln("ERROR in NFKC quick check at U+"+\r
587                                             Integer.toHexString(CPNFKC[count]));\r
588                 return;\r
589             }\r
590         }\r
591     }\r
592 \r
593     public void TestQuickCheckStringResult()\r
594                 throws Exception{\r
595         int count;\r
596         String d;\r
597         String c;\r
598 \r
599         for (count = 0; count < canonTests.length; count ++)\r
600         {\r
601             d = canonTests[count][1];\r
602             c = canonTests[count][2];\r
603             if (Normalizer.quickCheck(d,Normalizer.NFD,0)\r
604                                             != Normalizer.YES)\r
605             {\r
606                 errln("ERROR in NFD quick check for string at count " + count);\r
607                 return;\r
608             }\r
609 \r
610             if (Normalizer.quickCheck(c, Normalizer.NFC,0)\r
611                                             == Normalizer.NO)\r
612             {\r
613                 errln("ERROR in NFC quick check for string at count " + count);\r
614                 return;\r
615             }\r
616         }\r
617 \r
618         for (count = 0; count < compatTests.length; count ++)\r
619         {\r
620             d = compatTests[count][1];\r
621             c = compatTests[count][2];\r
622             if (Normalizer.quickCheck(d, Normalizer.NFKD,0)\r
623                                             != Normalizer.YES)\r
624             {\r
625                 errln("ERROR in NFKD quick check for string at count " + count);\r
626                 return;\r
627             }\r
628 \r
629             if (Normalizer.quickCheck(c,  Normalizer.NFKC,0)\r
630                                             != Normalizer.YES)\r
631             {\r
632                 errln("ERROR in NFKC quick check for string at count " + count);\r
633                 return;\r
634             }\r
635         }\r
636     }\r
637 \r
638     static final int qcToInt(Normalizer.QuickCheckResult qc) {\r
639         if(qc==Normalizer.NO) {\r
640             return 0;\r
641         } else if(qc==Normalizer.YES) {\r
642             return 1;\r
643         } else /* Normalizer.MAYBE */ {\r
644             return 2;\r
645         }\r
646     }\r
647 \r
648     public void TestQuickCheckPerCP() {\r
649         int c, lead, trail;\r
650         String s, nfd;\r
651         int lccc1, lccc2, tccc1, tccc2;\r
652         int qc1, qc2;\r
653 \r
654         if(\r
655             UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES\r
656             UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 ||\r
657             UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE\r
658             UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 ||\r
659             UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) ||\r
660             UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)\r
661         ) {\r
662             errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS");\r
663         }\r
664 \r
665         /*\r
666          * compare the quick check property values for some code points\r
667          * to the quick check results for checking same-code point strings\r
668          */\r
669         c=0;\r
670         while(c<0x110000) {\r
671             s=UTF16.valueOf(c);\r
672 \r
673             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK);\r
674             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC));\r
675             if(qc1!=qc2) {\r
676                 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c));\r
677             }\r
678 \r
679             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK);\r
680             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD));\r
681             if(qc1!=qc2) {\r
682                 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c));\r
683             }\r
684 \r
685             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK);\r
686             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC));\r
687             if(qc1!=qc2) {\r
688                 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c));\r
689             }\r
690 \r
691             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK);\r
692             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD));\r
693             if(qc1!=qc2) {\r
694                 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c));\r
695             }\r
696 \r
697             nfd=Normalizer.normalize(s, Normalizer.NFD);\r
698             lead=UTF16.charAt(nfd, 0);\r
699             trail=UTF16.charAt(nfd, nfd.length()-1);\r
700 \r
701             lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS);\r
702             lccc2=UCharacter.getCombiningClass(lead);\r
703             tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);\r
704             tccc2=UCharacter.getCombiningClass(trail);\r
705 \r
706             if(lccc1!=lccc2) {\r
707                 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c));\r
708             }\r
709             if(tccc1!=tccc2) {\r
710                 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c));\r
711             }\r
712 \r
713             /* skip some code points */\r
714             c=(20*c)/19+1;\r
715         }\r
716     }\r
717 \r
    //------------------------------------------------------------------------
    // Internal utilities
    //
724 \r
725 /*    private void backAndForth(Normalizer iter, String input)\r
726     {\r
727         iter.setText(input);\r
728 \r
729         // Run through the iterator forwards and stick it into a StringBuffer\r
730         StringBuffer forward =  new StringBuffer();\r
731         for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {\r
732             forward.append(ch);\r
733         }\r
734 \r
735         // Now do it backwards\r
736         StringBuffer reverse = new StringBuffer();\r
737         for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {\r
738             reverse.insert(0, ch);\r
739         }\r
740 \r
741         if (!forward.toString().equals(reverse.toString())) {\r
742             errln("FAIL: Forward/reverse mismatch for input " + hex(input)\r
743                   + ", forward: " + hex(forward) + ", backward: "+hex(reverse));\r
744         } else if (isVerbose()) {\r
745             logln("Ok: Forward/reverse for input " + hex(input)\r
746                   + ", forward: " + hex(forward) + ", backward: "+hex(reverse));\r
747         }\r
748     }*/\r
749 \r
750     private void backAndForth(Normalizer iter, String[][] tests)\r
751     {\r
752         for (int i = 0; i < tests.length; i++)\r
753         {\r
754             iter.setText(tests[i][0]);\r
755 \r
756             // Run through the iterator forwards and stick it into a\r
757             // StringBuffer\r
758             StringBuffer forward =  new StringBuffer();\r
759             for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {\r
760                 forward.append(ch);\r
761             }\r
762 \r
763             // Now do it backwards\r
764             StringBuffer reverse = new StringBuffer();\r
765             for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {\r
766                 reverse.insert(0, ch);\r
767             }\r
768 \r
769             if (!forward.toString().equals(reverse.toString())) {\r
770                 errln("FAIL: Forward/reverse mismatch for input "\r
771                     + hex(tests[i][0]) + ", forward: " + hex(forward)\r
772                     + ", backward: " + hex(reverse));\r
773             } else if (isVerbose()) {\r
774                 logln("Ok: Forward/reverse for input " + hex(tests[i][0])\r
775                       + ", forward: " + hex(forward) + ", backward: "\r
776                       + hex(reverse));\r
777             }\r
778         }\r
779     }\r
780 \r
781     private void staticTest (Normalizer.Mode mode,\r
782                              String[][] tests, int outCol) throws Exception{\r
783         for (int i = 0; i < tests.length; i++)\r
784         {\r
785             String input = Utility.unescape(tests[i][0]);\r
786             String expect = Utility.unescape(tests[i][outCol]);\r
787 \r
788             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
789 \r
790             String output = Normalizer.normalize(input, mode);\r
791 \r
792             if (!output.equals(expect)) {\r
793                 errln("FAIL: case " + i\r
794                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
795                     + " but got '" + output + "' (" + hex(output) + ")" );\r
796             }\r
797         }\r
798         char[] output = new char[1];\r
799         for (int i = 0; i < tests.length; i++)\r
800         {\r
801             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
802             String expect =Utility.unescape( tests[i][outCol]);\r
803 \r
804             logln("Normalizing '" + new String(input) + "' (" +\r
805                         hex(new String(input)) + ")" );\r
806             int reqLength=0;\r
807             while(true){\r
808                 try{\r
809                     reqLength=Normalizer.normalize(input,output, mode,0);\r
810                     if(reqLength<=output.length    ){\r
811                         break;\r
812                     }\r
813                 }catch(IndexOutOfBoundsException e){\r
814                     output= new char[Integer.parseInt(e.getMessage())];\r
815                     continue;\r
816                 }\r
817             }\r
818             if (!expect.equals(new String(output,0,reqLength))) {\r
819                 errln("FAIL: case " + i\r
820                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
821                     + " but got '" + new String(output)\r
822                     + "' ("  + hex(new String(output)) + ")" );\r
823             }\r
824         }\r
825     }\r
826     private void decomposeTest(Normalizer.Mode mode,\r
827                              String[][] tests, int outCol) throws Exception{\r
828         for (int i = 0; i < tests.length; i++)\r
829         {\r
830             String input = Utility.unescape(tests[i][0]);\r
831             String expect = Utility.unescape(tests[i][outCol]);\r
832 \r
833             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
834 \r
835             String output = Normalizer.decompose(input, mode==Normalizer.NFKD);\r
836 \r
837             if (!output.equals(expect)) {\r
838                 errln("FAIL: case " + i\r
839                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
840                     + " but got '" + output + "' (" + hex(output) + ")" );\r
841             }\r
842         }\r
843         char[] output = new char[1];\r
844         for (int i = 0; i < tests.length; i++)\r
845         {\r
846             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
847             String expect = Utility.unescape(tests[i][outCol]);\r
848 \r
849             logln("Normalizing '" + new String(input) + "' (" +\r
850                         hex(new String(input)) + ")" );\r
851             int reqLength=0;\r
852             while(true){\r
853                 try{\r
854                     reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0);\r
855                     if(reqLength<=output.length ){\r
856                         break;\r
857                     }\r
858                 }catch(IndexOutOfBoundsException e){\r
859                     output= new char[Integer.parseInt(e.getMessage())];\r
860                     continue;\r
861                 }\r
862             }\r
863             if (!expect.equals(new String(output,0,reqLength))) {\r
864                 errln("FAIL: case " + i\r
865                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
866                     + " but got '" + new String(output)\r
867                     + "' ("  + hex(new String(output)) + ")" );\r
868             }\r
869         }\r
870         output = new char[1];\r
871         for (int i = 0; i < tests.length; i++)\r
872         {\r
873            char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
874            String expect = Utility.unescape(tests[i][outCol]);\r
875     \r
876            logln("Normalizing '" + new String(input) + "' (" +\r
877                        hex(new String(input)) + ")" );\r
878            int reqLength=0;\r
879            while(true){\r
880                try{\r
881                    reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0);\r
882                    if(reqLength<=output.length ){\r
883                        break;\r
884                    }\r
885                }catch(IndexOutOfBoundsException e){\r
886                    output= new char[Integer.parseInt(e.getMessage())];\r
887                    continue;\r
888                }\r
889            }\r
890            if (!expect.equals(new String(output,0,reqLength))) {\r
891                errln("FAIL: case " + i\r
892                    + " expected '" + expect + "' (" + hex(expect) + ")"\r
893                    + " but got '" + new String(output)\r
894                    + "' ("  + hex(new String(output)) + ")" );\r
895            }\r
896            char[] output2 = new char[reqLength * 2];\r
897            System.arraycopy(output, 0, output2, 0, reqLength);\r
898            int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);\r
899            if(retLength != reqLength){\r
900                logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);\r
901            }\r
902         }\r
903     }\r
904 \r
905     private void composeTest(Normalizer.Mode mode,\r
906                              String[][] tests, int outCol) throws Exception{\r
907         for (int i = 0; i < tests.length; i++)\r
908         {\r
909             String input = Utility.unescape(tests[i][0]);\r
910             String expect = Utility.unescape(tests[i][outCol]);\r
911 \r
912             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
913 \r
914             String output = Normalizer.compose(input, mode==Normalizer.NFKC);\r
915 \r
916             if (!output.equals(expect)) {\r
917                 errln("FAIL: case " + i\r
918                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
919                     + " but got '" + output + "' (" + hex(output) + ")" );\r
920             }\r
921         }\r
922         char[] output = new char[1];\r
923         for (int i = 0; i < tests.length; i++)\r
924         {\r
925             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
926             String expect = Utility.unescape(tests[i][outCol]);\r
927 \r
928             logln("Normalizing '" + new String(input) + "' (" +\r
929                         hex(new String(input)) + ")" );\r
930             int reqLength=0;\r
931             while(true){\r
932                 try{\r
933                     reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0);\r
934                     if(reqLength<=output.length ){\r
935                         break;\r
936                     }\r
937                 }catch(IndexOutOfBoundsException e){\r
938                     output= new char[Integer.parseInt(e.getMessage())];\r
939                     continue;\r
940                 }\r
941             }\r
942             if (!expect.equals(new String(output,0,reqLength))) {\r
943                 errln("FAIL: case " + i\r
944                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
945                     + " but got '" + new String(output)\r
946                     + "' ("  + hex(new String(output)) + ")" );\r
947             }\r
948         }\r
949         output = new char[1];\r
950         for (int i = 0; i < tests.length; i++)\r
951         {\r
952             char[] input = Utility.unescape(tests[i][0]).toCharArray();\r
953             String expect = Utility.unescape(tests[i][outCol]);\r
954 \r
955             logln("Normalizing '" + new String(input) + "' (" +\r
956                         hex(new String(input)) + ")" );\r
957             int reqLength=0;\r
958             while(true){\r
959                 try{\r
960                     reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0);\r
961                     if(reqLength<=output.length ){\r
962                         break;\r
963                     }\r
964                 }catch(IndexOutOfBoundsException e){\r
965                     output= new char[Integer.parseInt(e.getMessage())];\r
966                     continue;\r
967                 }\r
968             }\r
969             if (!expect.equals(new String(output,0,reqLength))) {\r
970                 errln("FAIL: case " + i\r
971                     + " expected '" + expect + "' (" + hex(expect) + ")"\r
972                     + " but got '" + new String(output)\r
973                     + "' ("  + hex(new String(output)) + ")" );\r
974             }\r
975             \r
976             char[] output2 = new char[reqLength * 2];\r
977             System.arraycopy(output, 0, output2, 0, reqLength);\r
978             int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);\r
979             if(retLength != reqLength){\r
980                 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);\r
981             }\r
982         }\r
983     }\r
984     private void iterateTest(Normalizer iter, String[][] tests, int outCol){\r
985         for (int i = 0; i < tests.length; i++)\r
986         {\r
987             String input = Utility.unescape(tests[i][0]);\r
988             String expect = Utility.unescape(tests[i][outCol]);\r
989 \r
990             logln("Normalizing '" + input + "' (" + hex(input) + ")" );\r
991 \r
992             iter.setText(input);\r
993             assertEqual(expect, iter, "case " + i + " ");\r
994         }\r
995     }\r
996 \r
997     private void assertEqual(String expected, Normalizer iter, String msg)\r
998     {\r
999         int index = 0;\r
1000         int ch;\r
1001         UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);\r
1002         \r
1003         while ((ch=iter.next())!= Normalizer.DONE){\r
1004             if (index >= expected.length()) {\r
1005                 errln("FAIL: " + msg + "Unexpected character '" + (char)ch\r
1006                         + "' (" + hex(ch) + ")"\r
1007                         + " at index " + index);\r
1008                 break;\r
1009             }\r
1010             int want = UTF16.charAt(expected,index);\r
1011             if (ch != want) {\r
1012                 errln("FAIL: " + msg + "got '" + (char)ch\r
1013                         + "' (" + hex(ch) + ")"\r
1014                         + " but expected '" + want + "' (" + hex(want)+ ")"\r
1015                         + " at index " + index);\r
1016             }\r
1017             index+=  UTF16.getCharCount(ch);\r
1018         }\r
1019         if (index < expected.length()) {\r
1020             errln("FAIL: " + msg + "Only got " + index + " chars, expected "\r
1021             + expected.length());\r
1022         }\r
1023         \r
1024         cIter.setToLimit();\r
1025         while((ch=iter.previous())!=Normalizer.DONE){\r
1026             int want = cIter.previousCodePoint();\r
1027             if (ch != want ) {\r
1028                 errln("FAIL: " + msg + "got '" + (char)ch\r
1029                         + "' (" + hex(ch) + ")"\r
1030                         + " but expected '" + want + "' (" + hex(want) + ")"\r
1031                         + " at index " + index);\r
1032             }\r
1033         }\r
1034     }\r
1035     //--------------------------------------------------------------------------\r
1036 \r
1037     // NOTE: These tests are used for quick debugging so are not ported\r
1038     // to ICU4C tsnorm.cpp in intltest\r
1039     //\r
1040 \r
1041     public void TestDebugStatic(){\r
1042         String in = Utility.unescape("\\U0001D157\\U0001D165");\r
1043         if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){\r
1044             errln("isNormalized failed");\r
1045         }\r
1046 \r
1047         String input  =  "\uAD8B\uAD8B\uAD8B\uAD8B"+\r
1048             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1049             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1050             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1051             "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1052             "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1053             "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+\r
1054             "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+\r
1055             "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+\r
1056             "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+\r
1057             "\uAD8B\uAD8B\uAD8B\uAD8B"+\r
1058             "d\u031B\u0307\u0323";\r
1059         String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+\r
1060                         "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+\r
1061                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1062                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1063                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1064                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1065                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1066                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1067                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1068                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1069                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1070                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1071                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1072                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1073                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1074                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+\r
1075                         "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+\r
1076                         "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+\r
1077                         "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+\r
1078                         "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+\r
1079                         "cccccccccccccccccccccccccccccccccccccccccccccccc"+\r
1080                         "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+\r
1081                         "dddddddddddddddddddddddd"+\r
1082                         "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+\r
1083                         "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307";\r
1084             String output = Normalizer.normalize(Utility.unescape(input),\r
1085                             Normalizer.NFD);\r
1086             if(!expect.equals(output)){\r
1087                 errln("FAIL expected: "+hex(expect) + " got: "+hex(output));\r
1088             }\r
1089 \r
1090 \r
1091 \r
1092     }\r
1093     public void TestDebugIter(){\r
1094         String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");\r
1095         String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");\r
1096         Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)),\r
1097                                                 Normalizer.NONE,0);\r
1098         int index = 0;\r
1099         int ch;\r
1100         UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);\r
1101         \r
1102         while ((ch=iter.next())!= Normalizer.DONE){\r
1103             if (index >= expected.length()) {\r
1104                 errln("FAIL: " +  "Unexpected character '" + (char)ch\r
1105                         + "' (" + hex(ch) + ")"\r
1106                         + " at index " + index);\r
1107                 break;\r
1108             }\r
1109             int want = UTF16.charAt(expected,index);\r
1110             if (ch != want) {\r
1111                 errln("FAIL: " +  "got '" + (char)ch\r
1112                         + "' (" + hex(ch) + ")"\r
1113                         + " but expected '" + want + "' (" + hex(want)+ ")"\r
1114                         + " at index " + index);\r
1115             }\r
1116             index+=  UTF16.getCharCount(ch);\r
1117         }\r
1118         if (index < expected.length()) {\r
1119             errln("FAIL: " +  "Only got " + index + " chars, expected "\r
1120             + expected.length());\r
1121         }\r
1122         \r
1123         cIter.setToLimit();\r
1124         while((ch=iter.previous())!=Normalizer.DONE){\r
1125             int want = cIter.previousCodePoint();\r
1126             if (ch != want ) {\r
1127                 errln("FAIL: " + "got '" + (char)ch\r
1128                         + "' (" + hex(ch) + ")"\r
1129                         + " but expected '" + want + "' (" + hex(want) + ")"\r
1130                         + " at index " + index);\r
1131             }\r
1132         }\r
1133     }\r
1134     public void TestDebugIterOld(){\r
1135         String input = "\\U0001D15E";\r
1136         String expected = "\uD834\uDD57\uD834\uDD65";\r
1137         String expectedReverse = "\uD834\uDD65\uD834\uDD57";\r
1138         int index = 0;\r
1139         int ch;\r
1140         Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)),\r
1141                                                 Normalizer.NFKC,0);\r
1142         StringBuffer got = new StringBuffer();\r
1143         for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next())\r
1144         {\r
1145             if (index >= expected.length()) {\r
1146                 errln("FAIL: " +  "Unexpected character '" + (char)ch +\r
1147                        "' (" + hex(ch) + ")" + " at index " + index);\r
1148                 break;\r
1149             }\r
1150             got.append(UCharacter.toString(ch));\r
1151             index++;\r
1152         }\r
1153         if (!expected.equals(got.toString())) {\r
1154                 errln("FAIL: " +  "got '" +got+ "' (" + hex(got) + ")"\r
1155                         + " but expected '" + expected + "' ("\r
1156                         + hex(expected) + ")");\r
1157         }\r
1158         if (got.length() < expected.length()) {\r
1159             errln("FAIL: " +  "Only got " + index + " chars, expected "\r
1160                            + expected.length());\r
1161         }\r
1162 \r
1163         logln("Reverse Iteration\n");\r
1164         iter.setIndexOnly(iter.endIndex());\r
1165         got.setLength(0);\r
1166         for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){\r
1167             if (index >= expected.length()) {\r
1168                 errln("FAIL: " +  "Unexpected character '" + (char)ch\r
1169                                + "' (" + hex(ch) + ")" + " at index " + index);\r
1170                 break;\r
1171             }\r
1172             got.append(UCharacter.toString(ch));\r
1173         }\r
1174         if (!expectedReverse.equals(got.toString())) {\r
1175                 errln("FAIL: " +  "got '" +got+ "' (" + hex(got) + ")"\r
1176                                + " but expected '" + expected\r
1177                                + "' (" + hex(expected) + ")");\r
1178         }\r
1179         if (got.length() < expected.length()) {\r
1180             errln("FAIL: " +  "Only got " + index + " chars, expected "\r
1181                       + expected.length());\r
1182         }\r
1183 \r
1184     }\r
1185     //--------------------------------------------------------------------------\r
1186     // helper class for TestPreviousNext()\r
1187     // simple UTF-32 character iterator\r
1188     class UCharIterator {\r
1189 \r
1190        public UCharIterator(int[] src, int len, int index){\r
1191 \r
1192             s=src;\r
1193             length=len;\r
1194             i=index;\r
1195        }\r
1196 \r
1197         public int current() {\r
1198             if(i<length) {\r
1199                 return s[i];\r
1200             } else {\r
1201                 return -1;\r
1202             }\r
1203         }\r
1204 \r
1205         public int next() {\r
1206             if(i<length) {\r
1207                 return s[i++];\r
1208             } else {\r
1209                 return -1;\r
1210             }\r
1211         }\r
1212 \r
1213         public int previous() {\r
1214             if(i>0) {\r
1215                 return s[--i];\r
1216             } else {\r
1217                 return -1;\r
1218             }\r
1219         }\r
1220 \r
1221         public int getIndex() {\r
1222             return i;\r
1223         }\r
1224 \r
1225         private int[] s;\r
1226         private int length, i;\r
1227     }\r
1228     public void TestPreviousNext() {\r
1229         // src and expect strings\r
1230         char src[]={\r
1231             UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),\r
1232             UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),\r
1233             0xc4,\r
1234             0x1ed0\r
1235         };\r
1236         int expect[]={\r
1237             0x831d,\r
1238             0x1d158, 0x1d165,\r
1239             0x41, 0x308,\r
1240             0x4f, 0x302, 0x301\r
1241         };\r
1242 \r
1243         // expected src indexes corresponding to expect indexes\r
1244         int expectIndex[]={\r
1245             0,\r
1246             2, 2,\r
1247             4, 4,\r
1248             5, 5, 5,\r
1249             6 // behind last character\r
1250         };\r
1251 \r
1252         // initial indexes into the src and expect strings\r
1253 \r
1254         final int SRC_MIDDLE=4;\r
1255         final int EXPECT_MIDDLE=3;\r
1256 \r
1257 \r
1258         // movement vector\r
1259         // - for previous(), 0 for current(), + for next()\r
1260         // not const so that we can terminate it below for the error message\r
1261         String moves="0+0+0--0-0-+++0--+++++++0--------";\r
1262 \r
1263         // iterators\r
1264         Normalizer iter = new Normalizer(new String(src),\r
1265                                                 Normalizer.NFD,0);\r
1266         UCharIterator iter32 = new UCharIterator(expect, expect.length,\r
1267                                                      EXPECT_MIDDLE);\r
1268 \r
1269         int c1, c2;\r
1270         char m;\r
1271 \r
1272         // initially set the indexes into the middle of the strings\r
1273         iter.setIndexOnly(SRC_MIDDLE);\r
1274 \r
1275         // move around and compare the iteration code points with\r
1276         // the expected ones\r
1277         int movesIndex =0;\r
1278         while(movesIndex<moves.length()) {\r
1279             m=moves.charAt(movesIndex++);\r
1280             if(m=='-') {\r
1281                 c1=iter.previous();\r
1282                 c2=iter32.previous();\r
1283             } else if(m=='0') {\r
1284                 c1=iter.current();\r
1285                 c2=iter32.current();\r
1286             } else /* m=='+' */ {\r
1287                 c1=iter.next();\r
1288                 c2=iter32.next();\r
1289             }\r
1290 \r
1291             // compare results\r
1292             if(c1!=c2) {\r
1293                 // copy the moves until the current (m) move, and terminate\r
1294                 String history = moves.substring(0,movesIndex);\r
1295                 errln("error: mismatch in Normalizer iteration at "+history+": "\r
1296                       +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));\r
1297                 break;\r
1298             }\r
1299 \r
1300             // compare indexes\r
1301             if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {\r
1302                 // copy the moves until the current (m) move, and terminate\r
1303                 String history = moves.substring(0,movesIndex);\r
1304                 errln("error: index mismatch in Normalizer iteration at "\r
1305                       +history+ " : "+ "Normalizer index " +iter.getIndex()\r
1306                       +" expected "+ expectIndex[iter32.getIndex()]);\r
1307                 break;\r
1308             }\r
1309         }\r
1310     }\r
1311     // Only in ICU4j\r
1312     public void TestPreviousNextJCI() {\r
1313         // src and expect strings\r
1314         char src[]={\r
1315             UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),\r
1316             UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),\r
1317             0xc4,\r
1318             0x1ed0\r
1319         };\r
1320         int expect[]={\r
1321             0x831d,\r
1322             0x1d158, 0x1d165,\r
1323             0x41, 0x308,\r
1324             0x4f, 0x302, 0x301\r
1325         };\r
1326 \r
1327         // expected src indexes corresponding to expect indexes\r
1328         int expectIndex[]={\r
1329             0,\r
1330             2, 2,\r
1331             4, 4,\r
1332             5, 5, 5,\r
1333             6 // behind last character\r
1334         };\r
1335 \r
1336         // initial indexes into the src and expect strings\r
1337 \r
1338         final int SRC_MIDDLE=4;\r
1339         final int EXPECT_MIDDLE=3;\r
1340 \r
1341 \r
1342         // movement vector\r
1343         // - for previous(), 0 for current(), + for next()\r
1344         // not const so that we can terminate it below for the error message\r
1345         String moves="0+0+0--0-0-+++0--+++++++0--------";\r
1346 \r
1347         // iterators\r
1348         StringCharacterIterator text = new StringCharacterIterator(new String(src));\r
1349         Normalizer iter = new Normalizer(text,Normalizer.NFD,0);\r
1350         UCharIterator iter32 = new UCharIterator(expect, expect.length,\r
1351                                                      EXPECT_MIDDLE);\r
1352 \r
1353         int c1, c2;\r
1354         char m;\r
1355 \r
1356         // initially set the indexes into the middle of the strings\r
1357         iter.setIndexOnly(SRC_MIDDLE);\r
1358 \r
1359         // move around and compare the iteration code points with\r
1360         // the expected ones\r
1361         int movesIndex =0;\r
1362         while(movesIndex<moves.length()) {\r
1363             m=moves.charAt(movesIndex++);\r
1364             if(m=='-') {\r
1365                 c1=iter.previous();\r
1366                 c2=iter32.previous();\r
1367             } else if(m=='0') {\r
1368                 c1=iter.current();\r
1369                 c2=iter32.current();\r
1370             } else /* m=='+' */ {\r
1371                 c1=iter.next();\r
1372                 c2=iter32.next();\r
1373             }\r
1374 \r
1375             // compare results\r
1376             if(c1!=c2) {\r
1377                 // copy the moves until the current (m) move, and terminate\r
1378                 String history = moves.substring(0,movesIndex);\r
1379                 errln("error: mismatch in Normalizer iteration at "+history+": "\r
1380                       +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));\r
1381                 break;\r
1382             }\r
1383 \r
1384             // compare indexes\r
1385             if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {\r
1386                 // copy the moves until the current (m) move, and terminate\r
1387                 String history = moves.substring(0,movesIndex);\r
1388                 errln("error: index mismatch in Normalizer iteration at "\r
1389                       +history+ " : "+ "Normalizer index " +iter.getIndex()\r
1390                       +" expected "+ expectIndex[iter32.getIndex()]);\r
1391                 break;\r
1392             }\r
1393         }\r
1394     }\r
1395 \r
1396     // test APIs that are not otherwise used - improve test coverage\r
1397     public void TestNormalizerAPI() throws Exception {\r
1398         try{\r
1399             // instantiate a Normalizer from a CharacterIterator\r
1400             String s=Utility.unescape("a\u0308\uac00\\U0002f800");\r
1401             // make s a bit longer and more interesting\r
1402             UCharacterIterator iter = UCharacterIterator.getInstance(s+s);\r
1403             Normalizer norm = new Normalizer(iter, Normalizer.NFC,0);\r
1404             if(norm.next()!=0xe4) {\r
1405                 errln("error in Normalizer(CharacterIterator).next()");\r
1406             }   \r
1407     \r
1408             // test clone(), ==, and hashCode()\r
1409             Normalizer clone=(Normalizer)norm.clone();\r
1410             if(clone.equals(norm)) {\r
1411                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm");\r
1412             }\r
1413     \r
1414             \r
1415             if(clone.getLength()!= norm.getLength()){\r
1416                errln("error in Normalizer.getBeginIndex()");\r
1417             } \r
1418             // clone must have the same hashCode()\r
1419             //if(clone.hashCode()!=norm.hashCode()) {\r
1420             //    errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()");\r
1421             //}\r
1422             if(clone.next()!=0xac00) {\r
1423                 errln("error in Normalizer(Normalizer(CharacterIterator)).next()");\r
1424             }\r
1425             int ch = clone.next();\r
1426             if(ch!=0x4e3d) {\r
1427                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()");\r
1428             }\r
1429             // position changed, must change hashCode()\r
1430             if(clone.hashCode()==norm.hashCode()) {\r
1431                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()");\r
1432             }\r
1433     \r
1434             // test compose() and decompose()\r
1435             StringBuffer tel;\r
1436             String nfkc, nfkd;\r
1437             tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121");\r
1438             tel.insert(1,(char)0x0301);\r
1439     \r
1440             nfkc=Normalizer.compose(tel.toString(), true);\r
1441             nfkd=Normalizer.decompose(tel.toString(), true);\r
1442             if(\r
1443                 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))||\r
1444                 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL"))\r
1445             ) {\r
1446                 errln("error in Normalizer::(de)compose(): wrong result(s)");\r
1447             }\r
1448     \r
1449             // test setIndex()\r
1450 //            ch=norm.setIndex(3);\r
1451 //            if(ch!=0x4e3d) {\r
1452 //                errln("error in Normalizer(CharacterIterator).setIndex(3)");\r
1453 //            }\r
1454     \r
1455             // test setText(CharacterIterator) and getText()\r
1456             String out, out2;\r
1457             clone.setText(iter);\r
1458     \r
1459             out = clone.getText();\r
1460             out2 = iter.getText();\r
1461             if( !out.equals(out2) ||\r
1462                 clone.startIndex()!=0||\r
1463                 clone.endIndex()!=iter.getLength()\r
1464             ) {\r
1465                 errln("error in Normalizer::setText() or Normalizer::getText()");\r
1466             }\r
1467      \r
1468             char[] fillIn1 = new char[clone.getLength()];\r
1469             char[] fillIn2 = new char[iter.getLength()];\r
1470             int len = clone.getText(fillIn1);\r
1471             iter.getText(fillIn2,0);\r
1472             if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){\r
1473                 errln("error in Normalizer.getText(). Normalizer: "+\r
1474                                 Utility.hex(new String(fillIn1))+ \r
1475                                 " Iter: " + Utility.hex(new String(fillIn2)));\r
1476             }\r
1477             \r
1478             clone.setText(fillIn1);\r
1479             len = clone.getText(fillIn2);\r
1480             if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){\r
1481                 errln("error in Normalizer.setText() or Normalizer.getText()"+\r
1482                                 Utility.hex(new String(fillIn1))+ \r
1483                                 " Iter: " + Utility.hex(new String(fillIn2)));\r
1484             }\r
1485     \r
1486             // test setText(UChar *), getUMode() and setMode()\r
1487             clone.setText(s);\r
1488             clone.setIndexOnly(1);\r
1489             clone.setMode(Normalizer.NFD);\r
1490             if(clone.getMode()!=Normalizer.NFD) {\r
1491                 errln("error in Normalizer::setMode() or Normalizer::getMode()");\r
1492             }\r
1493             if(clone.next()!=0x308 || clone.next()!=0x1100) {\r
1494                 errln("error in Normalizer::setText() or Normalizer::setMode()");\r
1495             }\r
1496     \r
1497             // test last()/previous() with an internal buffer overflow\r
1498             StringBuffer buf = new StringBuffer("aaaaaaaaaa");\r
1499             buf.setCharAt(10-1,'\u0308');\r
1500             clone.setText(buf);\r
1501             if(clone.last()!=0x308) {\r
1502                 errln("error in Normalizer(10*U+0308).last()");\r
1503             }\r
1504     \r
1505             // test UNORM_NONE\r
1506             norm.setMode(Normalizer.NONE);\r
1507             if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) {\r
1508                 errln("error in Normalizer(UNORM_NONE).first()/next()/last()");\r
1509             }\r
1510             out=Normalizer.normalize(s, Normalizer.NONE);\r
1511             if(!out.equals(s)) {\r
1512                 errln("error in Normalizer::normalize(UNORM_NONE)");\r
1513             }\r
1514             ch = 0x1D15E;\r
1515             String exp = "\\U0001D157\\U0001D165";\r
1516             String ns = Normalizer.normalize(ch,Normalizer.NFC);\r
1517             if(!ns.equals(Utility.unescape(exp))){\r
1518                 errln("error in Normalizer.normalize(int,Mode)");\r
1519             }\r
1520             ns = Normalizer.normalize(ch,Normalizer.NFC,0);\r
1521             if(!ns.equals(Utility.unescape(exp))){\r
1522                 errln("error in Normalizer.normalize(int,Mode,int)");\r
1523             }\r
1524             \r
1525             \r
1526         }catch(Exception e){\r
1527             throw e;\r
1528         }\r
1529     }\r
1530 \r
1531     public void TestConcatenate() {\r
1532 \r
1533         Object[][]cases=new Object[][]{\r
1534             /* mode, left, right, result */\r
1535             {\r
1536                 Normalizer.NFC,\r
1537                 "re",\r
1538                 "\u0301sum\u00e9",\r
1539                 "r\u00e9sum\u00e9"\r
1540             },\r
1541             {\r
1542                 Normalizer.NFC,\r
1543                 "a\u1100",\r
1544                 "\u1161bcdefghijk",\r
1545                 "a\uac00bcdefghijk"\r
1546             },\r
1547             /* ### TODO: add more interesting cases */\r
1548             {\r
1549                 Normalizer.NFD,\r
1550                 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +\r
1551                 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +\r
1552                 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +\r
1553                 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +\r
1554                 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +\r
1555                 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +\r
1556                 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB",\r
1557 \r
1558                 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +\r
1559                 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +\r
1560                 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +\r
1561                 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +\r
1562                 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +\r
1563                 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +\r
1564                 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E",\r
1565 \r
1566                 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +\r
1567                 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +\r
1568                 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +\r
1569                 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +\r
1570                 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +\r
1571                 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +\r
1572                 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u0399" +\r
1573                 "\u0301\u03C5\u0308\u0301\u1FEB\u1FEE\u1FEF\u1FF9" +\r
1574                 "\u1FFB\u1FFD\u2000\u2001\u2126\u212A\u212B\u2329" +\r
1575                 "\u232A\uF900\uFA10\uFA12\uFA15\uFA20\uFA22\uFA25" +\r
1576                 "\uFA26\uFA2A\uFB1F\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E" +\r
1577                 "\uFB2F\uFB30\uFB31\uFB32\uFB33\uFB34\uFB35\uFB36" +\r
1578                 "\uFB38\uFB39\uFB3A\uFB3B\uFB3C\uFB3E\uFB40\uFB41" +\r
1579                 "\uFB43\uFB44\uFB46\uFB47\uFB48\uFB49\uFB4A\uFB4B" +\r
1580                 "\uFB4C\uFB4D\uFB4E"\r
1581             }\r
1582         };\r
1583 \r
1584         String left, right, expect, result;\r
1585         Normalizer.Mode mode;\r
1586         int i;\r
1587 \r
1588         /* test concatenation */\r
1589         for(i=0; i<cases.length; ++i) {\r
1590             mode = (Normalizer.Mode)cases[i][0];\r
1591 \r
1592             left=(String)cases[i][1];\r
1593             right=(String)cases[i][2];\r
1594             expect=(String)cases[i][3];\r
1595             {\r
1596                 result=Normalizer.concatenate(left, right, mode,0);\r
1597                 if( result.equals(expect)) {\r
1598                     errln("error in Normalizer.concatenate(), cases[] failed"\r
1599                           +", result==expect: expected: "\r
1600                           + hex(expect)+" =========> got: " + hex(result));\r
1601                 }\r
1602             }\r
1603             {\r
1604                 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0);\r
1605                 if( result.equals(expect)) {\r
1606                     errln("error in Normalizer.concatenate(), cases[] failed"\r
1607                           +", result==expect: expected: "\r
1608                           + hex(expect)+" =========> got: " + hex(result));\r
1609                 }\r
1610             }\r
1611         }\r
1612     }\r
1613     private final int RAND_MAX = 0x7fff;\r
1614 \r
1615     public void TestCheckFCD()\r
1616     {\r
1617       char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,\r
1618                      0x0008, 0x0009, 0x000A};\r
1619 \r
1620       char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301,\r
1621                       0x02B9, 0x0314, 0x0315, 0x0316};\r
1622 \r
1623       char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7,\r
1624                      0x0050, 0x0730, 0x09EE, 0x1E10};\r
1625 \r
1626       char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0},\r
1627                           {0x0061, 0x030A, 0x00E2, 0x0323, 0},\r
1628                           {0x0061, 0x0323, 0x00E2, 0x0323, 0},\r
1629                           {0x0061, 0x0323, 0x1E05, 0x0302, 0}\r
1630                         };\r
1631       Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES};\r
1632 \r
1633       char[] datachar= {        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,\r
1634                                 0x6a,\r
1635                                 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,\r
1636                                 0xea,\r
1637                                 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306,\r
1638                                 0x0307, 0x0308, 0x0309, 0x030a,\r
1639                                 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326,\r
1640                                 0x0327, 0x0328, 0x0329, 0x032a,\r
1641                                 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06,\r
1642                                 0x1e07, 0x1e08, 0x1e09, 0x1e0a\r
1643                        };\r
1644 \r
1645       int count = 0;\r
1646 \r
1647       if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES)\r
1648         errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n");\r
1649       if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO)\r
1650         errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n");\r
1651       if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES)\r
1652         errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n");\r
1653 \r
1654 \r
1655       while (count < 4)\r
1656       {\r
1657         Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0);\r
1658         if (result[count] != fcdresult) {\r
1659             errln("Normalizer.quickCheck(FCD) failed: Data set "+ count\r
1660                     + " expected value "+ result[count]);\r
1661         }\r
1662         count ++;\r
1663       }\r
1664 \r
1665       /* random checks of long strings */\r
1666       //srand((unsigned)time( NULL ));\r
1667       Random rand = createRandom(); // use test framework's random\r
1668 \r
1669       for (count = 0; count < 50; count ++)\r
1670       {\r
1671         int size = 0;\r
1672         Normalizer.QuickCheckResult testresult = Normalizer.YES;\r
1673         char[] data= new char[20];\r
1674         char[] norm= new char[100];\r
1675         char[] nfd = new char[100];\r
1676         int normStart = 0;\r
1677         int nfdsize = 0;\r
1678         while (size != 19) {\r
1679           data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX];\r
1680           logln("0x"+data[size]);\r
1681           normStart += Normalizer.normalize(data,size,size+1,\r
1682                                               norm,normStart,100,\r
1683                                               Normalizer.NFD,0);\r
1684           size ++;\r
1685         }\r
1686         logln("\n");\r
1687 \r
1688         nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0);\r
1689         //    nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL,\r
1690         //                      nfd, 100, &status);\r
1691         if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) {\r
1692           testresult = Normalizer.NO;\r
1693         }\r
1694         if (testresult == Normalizer.YES) {\r
1695           logln("result Normalizer.YES\n");\r
1696         }\r
1697         else {\r
1698           logln("result Normalizer.NO\n");\r
1699         }\r
1700 \r
1701         if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) {\r
1702           errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) );\r
1703         }\r
1704       }\r
1705     }\r
1706 \r
1707 \r
1708     // reference implementation of Normalizer::compare\r
1709     private int ref_norm_compare(String s1, String s2, int options) {\r
1710         String t1, t2,r1,r2;\r
1711 \r
1712         int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);\r
1713         \r
1714         if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) {\r
1715             // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))\r
1716             r1 = Normalizer.decompose(s1,false,normOptions);\r
1717             r2 = Normalizer.decompose(s2,false,normOptions);\r
1718             r1 = UCharacter.foldCase(r1,options);\r
1719             r2 = UCharacter.foldCase(r2,options);\r
1720         }else{\r
1721             r1 = s1;\r
1722             r2 = s2;\r
1723         }\r
1724         \r
1725         t1 = Normalizer.decompose(r1, false, normOptions);\r
1726         t2 = Normalizer.decompose(r2, false, normOptions);\r
1727 \r
1728         if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {\r
1729             UTF16.StringComparator comp \r
1730                     = new UTF16.StringComparator(true, false, \r
1731                                      UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1732             return comp.compare(t1,t2);\r
1733         } else {\r
1734             return t1.compareTo(t2);\r
1735         }\r
1736 \r
1737     }\r
1738 \r
1739     // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately\r
1740     private int norm_compare(String s1, String s2, int options) {\r
1741         int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);\r
1742 \r
1743         if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) &&\r
1744             Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) {\r
1745             options|=Normalizer.INPUT_IS_FCD;\r
1746         }\r
1747 \r
1748         return Normalizer.compare(s1, s2, options);\r
1749     }\r
1750 \r
1751     // reference implementation of UnicodeString::caseCompare\r
1752     private int ref_case_compare(String s1, String s2, int options) {\r
1753         String t1, t2;\r
1754 \r
1755         t1=s1;\r
1756         t2=s2;\r
1757 \r
1758         t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));\r
1759         t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));\r
1760 \r
1761         if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {\r
1762             UTF16.StringComparator comp \r
1763                     = new UTF16.StringComparator(true, false,\r
1764                                     UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1765             return comp.compare(t1,t2);\r
1766         } else {\r
1767             return t1.compareTo(t2);\r
1768         }\r
1769 \r
1770     }\r
1771 \r
1772     // reduce an integer to -1/0/1\r
1773     private static int sign(int value) {\r
1774         if(value==0) {\r
1775             return 0;\r
1776         } else {\r
1777             return (value>>31)|1;\r
1778         }\r
1779     }\r
1780     private static String signString(int value) {\r
1781         if(value<0) {\r
1782             return "<0";\r
1783         } else if(value==0) {\r
1784             return "=0";\r
1785         } else /* value>0 */ {\r
1786             return ">0";\r
1787         }\r
1788     }\r
1789     // test Normalizer::compare and unorm_compare (thinly wrapped by the former)\r
1790     // by comparing it with its semantic equivalent\r
1791     // since we trust the pieces, this is sufficient\r
1792 \r
1793     // test each string with itself and each other\r
1794     // each time with all options\r
1795     private  String strings[]=new String[]{\r
1796                 // some cases from NormalizationTest.txt\r
1797                 // 0..3\r
1798                 "D\u031B\u0307\u0323",\r
1799                 "\u1E0C\u031B\u0307",\r
1800                 "D\u031B\u0323\u0307",\r
1801                 "d\u031B\u0323\u0307",\r
1802         \r
1803                 // 4..6\r
1804                 "\u00E4",\r
1805                 "a\u0308",\r
1806                 "A\u0308",\r
1807         \r
1808                 // Angstrom sign = A ring\r
1809                 // 7..10\r
1810                 "\u212B",\r
1811                 "\u00C5",\r
1812                 "A\u030A",\r
1813                 "a\u030A",\r
1814         \r
1815                 // 11.14\r
1816                 "a\u059A\u0316\u302A\u032Fb",\r
1817                 "a\u302A\u0316\u032F\u059Ab",\r
1818                 "a\u302A\u0316\u032F\u059Ab",\r
1819                 "A\u059A\u0316\u302A\u032Fb",\r
1820         \r
1821                 // from ICU case folding tests\r
1822                 // 15..20\r
1823                 "A\u00df\u00b5\ufb03\\U0001040c\u0131",\r
1824                 "ass\u03bcffi\\U00010434i",\r
1825                 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff",\r
1826                 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff",\r
1827                 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff",\r
1828                 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd",\r
1829         \r
1830                 //     U+d800 U+10001   see implementation comment in unorm_cmpEquivFold\r
1831                 // vs. U+10000          at bottom - code point order\r
1832                 // 21..22\r
1833                 "\ud800\ud800\udc01",\r
1834                 "\ud800\udc00",\r
1835         \r
1836                 // other code point order tests from ustrtest.cpp\r
1837                 // 23..31\r
1838                 "\u20ac\ud801",\r
1839                 "\u20ac\ud800\udc00",\r
1840                 "\ud800",\r
1841                 "\ud800\uff61",\r
1842                 "\udfff",\r
1843                 "\uff61\udfff",\r
1844                 "\uff61\ud800\udc02",\r
1845                 "\ud800\udc02",\r
1846                 "\ud84d\udc56",\r
1847         \r
1848                 // long strings, see cnormtst.c/TestNormCoverage()\r
1849                 // equivalent if case-insensitive\r
1850                 // 32..33\r
1851                 "\uAD8B\uAD8B\uAD8B\uAD8B"+\r
1852                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1853                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1854                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1855                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1856                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1857                 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+\r
1858                 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+\r
1859                 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+\r
1860                 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+\r
1861                 "\uAD8B\uAD8B\uAD8B\uAD8B"+\r
1862                 "d\u031B\u0307\u0323",\r
1863         \r
1864                 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+\r
1865                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1866                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1867                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1868                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1869                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+\r
1870                 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+\r
1871                 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+\r
1872                 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+\r
1873                 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+\r
1874                 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+\r
1875                 "\u1E0C\u031B\u0307",\r
1876         \r
1877                 // some strings that may make a difference whether the compare function\r
1878                 // case-folds or decomposes first\r
1879                 // 34..41\r
1880                 "\u0360\u0345\u0334",\r
1881                 "\u0360\u03b9\u0334",\r
1882         \r
1883                 "\u0360\u1f80\u0334",\r
1884                 "\u0360\u03b1\u0313\u03b9\u0334",\r
1885         \r
1886                 "\u0360\u1ffc\u0334",\r
1887                 "\u0360\u03c9\u03b9\u0334",\r
1888         \r
1889                 "a\u0360\u0345\u0360\u0345b",\r
1890                 "a\u0345\u0360\u0345\u0360b",\r
1891         \r
1892                 // interesting cases for canonical caseless match with turkic i handling\r
1893                 // 42..43\r
1894                 "\u00cc",\r
1895                 "\u0069\u0300",\r
1896         \r
1897                 // strings with post-Unicode 3.2 normalization or normalization corrections\r
1898                 // 44..45\r
1899                 "\u00e4\u193b\\U0002f868",\r
1900                 "\u0061\u193b\u0308\u36fc",\r
1901 \r
1902 \r
1903     };\r
1904 \r
1905     // all combinations of options\r
1906     // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions\r
1907     final class Temp {\r
1908         int options;\r
1909         String name;\r
1910         public Temp(int opt,String str){\r
1911             options =opt;\r
1912             name = str;\r
1913         }\r
1914 \r
1915     }\r
1916     // set UNORM_UNICODE_3_2 in one additional combination\r
1917   \r
1918     private Temp[] opt = new Temp[]{\r
1919                     new Temp(0,"default"),\r
1920                     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ),\r
1921                     new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ),\r
1922                     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ),\r
1923                     new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"),\r
1924                     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"),\r
1925                     new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2")\r
1926             };\r
1927 \r
1928 \r
1929     public void TestCompareDebug(){\r
1930 \r
1931         String[] s = new String[100]; // at least as many items as in strings[] !\r
1932 \r
1933 \r
1934         int i, j, k, count=strings.length;\r
1935         int result, refResult;\r
1936 \r
1937         // create the UnicodeStrings\r
1938         for(i=0; i<count; ++i) {\r
1939             s[i]=Utility.unescape(strings[i]);\r
1940         }\r
1941         UTF16.StringComparator comp = new UTF16.StringComparator(true, false, \r
1942                                      UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1943         // test them each with each other\r
1944 \r
1945         i = 42;\r
1946         j = 43;\r
1947         k = 2;\r
1948         // test Normalizer::compare\r
1949         result=norm_compare(s[i], s[j], opt[k].options);\r
1950         refResult=ref_norm_compare(s[i], s[j], opt[k].options);\r
1951         if(sign(result)!=sign(refResult)) {\r
1952             errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
1953         }\r
1954 \r
1955         // test UnicodeString::caseCompare - same internal implementation function\r
1956          if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {\r
1957         //    result=s[i]. (s[j], opt[k].options);\r
1958             if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)\r
1959             {\r
1960                 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
1961             }\r
1962             else {\r
1963                 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
1964             }\r
1965             \r
1966             result=comp.compare(s[i],s[j]);\r
1967             refResult=ref_case_compare(s[i], s[j], opt[k].options);\r
1968             if(sign(result)!=sign(refResult)) {\r
1969                       errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
1970                             }\r
1971         }\r
1972         String value1 = "\u00dater\u00fd";\r
1973         String value2 = "\u00fater\u00fd";\r
1974         if(Normalizer.compare(value1,value2,0)!=0){\r
1975             if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){\r
1976 \r
1977             }\r
1978         }\r
1979     }\r
1980 \r
1981     public void TestCompare() {\r
1982 \r
1983         String[] s = new String[100]; // at least as many items as in strings[] !\r
1984 \r
1985         int i, j, k, count=strings.length;\r
1986         int result, refResult;\r
1987 \r
1988         // create the UnicodeStrings\r
1989         for(i=0; i<count; ++i) {\r
1990             s[i]=Utility.unescape(strings[i]);\r
1991         }\r
1992         UTF16.StringComparator comp = new UTF16.StringComparator();\r
1993         // test them each with each other\r
1994         for(i=0; i<count; ++i) {\r
1995             for(j=i; j<count; ++j) {\r
1996                 for(k=0; k<opt.length; ++k) {\r
1997                     // test Normalizer::compare\r
1998                     result=norm_compare(s[i], s[j], opt[k].options);\r
1999                     refResult=ref_norm_compare(s[i], s[j], opt[k].options);\r
2000                     if(sign(result)!=sign(refResult)) {\r
2001                         errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
2002                     }\r
2003 \r
2004                     // test UnicodeString::caseCompare - same internal implementation function\r
2005                      if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {\r
2006                         //    result=s[i]. (s[j], opt[k].options);\r
2007                         if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)\r
2008                         {\r
2009                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
2010                         }\r
2011                         else {\r
2012                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
2013                         }\r
2014                         \r
2015                         comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);\r
2016                         // result=comp.caseCompare(s[i],s[j], opt[k].options);\r
2017                         result=comp.compare(s[i],s[j]);\r
2018                         refResult=ref_case_compare(s[i], s[j], opt[k].options);\r
2019                         if(sign(result)!=sign(refResult)) {\r
2020                                   errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);\r
2021                                          }\r
2022                     }\r
2023                 }\r
2024             }\r
2025         }\r
2026         \r
2027         // test cases with i and I to make sure Turkic works\r
2028         char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };\r
2029         USerializedSet sset=new USerializedSet();\r
2030         UnicodeSet set = new UnicodeSet();\r
2031     \r
2032         String s1, s2;\r
2033         int start, end;\r
2034     \r
2035         // collect all sets into one for contiguous output\r
2036         int[] startEnd = new int[2];\r
2037         for(i=0; i<iI.length; ++i) {\r
2038             if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {\r
2039                 count=sset.countRanges();\r
2040                 for(j=0; j<count; ++j) {\r
2041                     sset.getRange(j, startEnd);\r
2042                     set.add(startEnd[0], startEnd[1]);\r
2043                 }\r
2044             }\r
2045         }\r
2046 \r
2047         // test all of these precomposed characters\r
2048         UnicodeSetIterator it = new UnicodeSetIterator(set);\r
2049         while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {\r
2050             start=it.codepoint;\r
2051             end=it.codepointEnd;\r
2052             while(start<=end) {\r
2053                 s1 = Integer.toString(start);\r
2054                 s2 = Normalizer.decompose(s1, false, 0);\r
2055 //                if(U_FAILURE(errorCode)) {\r
2056 //                    errln("Normalizer::decompose(U+%04x) failed: %s", start, u_errorName(errorCode));\r
2057 //                    return;\r
2058 //                }\r
2059                 for(k=0; k<opt.length; ++k) {\r
2060                     // test Normalizer::compare\r
2061 \r
2062                     result= norm_compare(s1, s2, opt[k].options);\r
2063                     refResult=ref_norm_compare(s1, s2, opt[k].options);\r
2064                     if(sign(result)!=sign(refResult)) {\r
2065                         errln("Normalizer.compare(U+"+hex(start)+" with its NFD, "+opt[k].name+")" \r
2066                               + signString(result)+" should be "+signString(refResult));\r
2067                     }\r
2068     \r
2069                     // test UnicodeString::caseCompare - same internal implementation function\r
2070                     if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) {\r
2071                          if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)\r
2072                         {\r
2073                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);\r
2074                         }\r
2075                         else {\r
2076                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
2077                         }\r
2078                         \r
2079                         comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);\r
2080          \r
2081                         result=comp.compare(s1,s2);\r
2082                         refResult=ref_case_compare(s1, s2, opt[k].options);\r
2083                         if(sign(result)!=sign(refResult)) {\r
2084                             errln("UTF16.compare(U+"+hex(start)+" with its NFD, "\r
2085                                   +opt[k].name+")"+signString(result) +" should be "+signString(refResult));\r
2086                         }\r
2087                     }\r
2088                 }\r
2089     \r
2090                 ++start;\r
2091             }\r
2092         }\r
2093 \r
2094     }\r
2095 \r
2096     // verify that case-folding does not un-FCD strings\r
2097     int countFoldFCDExceptions(int foldingOptions) {\r
2098         String s, d;\r
2099         int c;\r
2100         int count;\r
2101         int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC;\r
2102         Normalizer.QuickCheckResult qcResult;\r
2103         int category;\r
2104         boolean isNFD;\r
2105 \r
2106 \r
2107         logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions));\r
2108 \r
2109         count=0;\r
2110         for(c=0; c<=0x10ffff; ++c) {\r
2111             category=UCharacter.getType(c);\r
2112             if(category==UCharacterCategory.UNASSIGNED) {\r
2113                 continue; // skip unassigned code points\r
2114             }\r
2115             if(c==0xac00) {\r
2116                 c=0xd7a3; // skip Hangul - no case folding there\r
2117                 continue;\r
2118             }\r
2119             // skip Han blocks - no case folding there either\r
2120             if(c==0x3400) {\r
2121                 c=0x4db5;\r
2122                 continue;\r
2123             }\r
2124             if(c==0x4e00) {\r
2125                 c=0x9fa5;\r
2126                 continue;\r
2127             }\r
2128             if(c==0x20000) {\r
2129                 c=0x2a6d6;\r
2130                 continue;\r
2131             }\r
2132 \r
2133             s= UTF16.valueOf(c);\r
2134 \r
2135             // get leading and trailing cc for c\r
2136             d= Normalizer.decompose(s,false);\r
2137             isNFD= s==d;\r
2138             cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));\r
2139             trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));\r
2140 \r
2141             // get leading and trailing cc for the case-folding of c\r
2142             UCharacter.foldCase(s,(foldingOptions==0));\r
2143             d = Normalizer.decompose(s, false);\r
2144             foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));\r
2145             foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));\r
2146 \r
2147             qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);\r
2148 \r
2149 \r
2150             // bad:\r
2151             // - character maps to empty string: adjacent characters may then need reordering\r
2152             // - folding has different leading/trailing cc's, and they don't become just 0\r
2153             // - folding itself is not FCD\r
2154             if( qcResult!=Normalizer.YES ||\r
2155                 s.length()==0 ||\r
2156                 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)\r
2157             ) {\r
2158                 ++count;\r
2159                 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");\r
2160                 //errln("  cc %02x trailCC %02x    foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x   quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult);\r
2161                 continue;\r
2162             }\r
2163 \r
2164             // also bad:\r
2165             // if a code point is in NFD but its case folding is not, then\r
2166             // unorm_compare will also fail\r
2167             if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {\r
2168                 ++count;\r
2169                 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");\r
2170             }\r
2171         }\r
2172 \r
2173         logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" );\r
2174         return count;\r
2175     }\r
2176 \r
2177     public void TestFindFoldFCDExceptions() {\r
2178         int count;\r
2179 \r
2180         count=countFoldFCDExceptions(0);\r
2181         count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I);\r
2182         if(count>0) {\r
2183             //*\r
2184             //* If case-folding un-FCDs any strings, then unorm_compare() must be\r
2185             //* re-implemented.\r
2186             //* It currently assumes that one can check for FCD then case-fold\r
2187             //* and then still have FCD strings for raw decomposition without reordering.\r
2188             //*\r
2189             errln("error: There are "+count+" code points for which case-folding"+\r
2190                   " may un-FCD a string for all folding options.\n See comment"+\r
2191                   " in BasicNormalizerTest::FindFoldFCDExceptions()!");\r
2192         }\r
2193     }\r
2194     \r
2195     public void TestCombiningMarks(){\r
2196         String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";\r
2197         String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";\r
2198         String result = Normalizer.decompose(src,false);\r
2199         if(!expected.equals(result)){\r
2200             errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result));\r
2201         }\r
2202     }\r
2203 \r
2204     /*\r
2205      * Re-enable this test when UTC fixes UAX 21\r
2206     public void TestUAX21Failure(){\r
2207         final String[][] cases = new String[][]{\r
2208                 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"},\r
2209                 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"},\r
2210                 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},\r
2211                 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},\r
2212                 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"},\r
2213                 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"},\r
2214         };\r
2215         for(int i = 0; i< cases.length; i++){\r
2216             String s1 =cases[0][0]; \r
2217             String s2 = cases[0][1];\r
2218             if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare\r
2219                 &&\r
2220                 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){\r
2221                 errln("Normalizer.compare() failed for s1: " \r
2222                         + Utility.hex(s1) +" s2: " + Utility.hex(s2));\r
2223             }\r
2224         }\r
2225     }\r
2226     */\r
2227     public void TestFCNFKCClosure() {\r
2228         final class TestStruct{\r
2229             int c;\r
2230             String s;\r
2231             TestStruct(int cp, String src){\r
2232                 c=cp;\r
2233                 s=src;\r
2234             }\r
2235         }\r
2236         \r
2237         TestStruct[] tests= new TestStruct[]{\r
2238             new TestStruct( 0x037A, "\u0020\u03B9" ),\r
2239             new TestStruct( 0x03D2, "\u03C5" ),\r
2240             new TestStruct( 0x20A8, "\u0072\u0073" ) ,\r
2241             new TestStruct( 0x210B, "\u0068" ),\r
2242             new TestStruct( 0x210C, "\u0068" ),\r
2243             new TestStruct( 0x2121, "\u0074\u0065\u006C" ),\r
2244             new TestStruct( 0x2122, "\u0074\u006D" ),\r
2245             new TestStruct( 0x2128, "\u007A" ),\r
2246             new TestStruct( 0x1D5DB,"\u0068" ),\r
2247             new TestStruct( 0x1D5ED,"\u007A" ),\r
2248             new TestStruct( 0x0061, "" )\r
2249         };\r
2250     \r
2251 \r
2252         for(int i = 0; i < tests.length; ++ i) {\r
2253             String result=Normalizer.getFC_NFKC_Closure(tests[i].c);\r
2254             if(!result.equals(new String(tests[i].s))) {\r
2255                 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong");\r
2256             }\r
2257         }\r
2258     \r
2259         /* error handling */\r
2260 \r
2261         int length=Normalizer.getFC_NFKC_Closure(0x5c, null);\r
2262         if(length!=0){\r
2263             errln("getFC_NFKC_Closure did not perform error handling correctly");\r
2264         }\r
2265     }\r
2266     public void TestBugJ2324(){\r
2267        /* String[] input = new String[]{\r
2268                             //"\u30FD\u3099",\r
2269                             "\u30FA\u309A",\r
2270                             "\u30FB\u309A",\r
2271                             "\u30FC\u309A",\r
2272                             "\u30FE\u309A",\r
2273                             "\u30FD\u309A",\r
2274 \r
2275         };*/\r
2276         String troublesome = "\u309A";\r
2277         for(int i=0x3000; i<0x3100;i++){\r
2278             String input = ((char)i)+troublesome;\r
2279             try{                            \r
2280               /*  String result =*/ Normalizer.compose(input,false);\r
2281             }catch(IndexOutOfBoundsException e){\r
2282                 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString());\r
2283             }\r
2284         }\r
2285                 \r
2286     }\r
2287 \r
     // Index constants. initSkippables() uses D, C and KD as indices into its
     // skipSets array. KC, FCD and NONE are not used in the code visible here;
     // presumably they index further per-mode data elsewhere in this file (TODO confirm).
     static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5;   
2289     private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets){\r
2290         if( skipSets.length < 4 ){\r
2291             return null;\r
2292         }\r
2293         skipSets[D].applyPattern(\r
2294             "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"\r
2295             + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD"\r
2296             + "\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137"\r
2297             + "\\u0139-\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165"\r
2298             + "\\u0168-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC"\r
2299             + "\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B"\r
2300             + "\\u021E\\u021F\\u0226-\\u0233\\u0300-\\u034E\\u0350-\\u036F"\r
2301             + "\\u0374\\u037E\\u0385-\\u038A\\u038C\\u038E-\\u0390\\u03AA-"\r
2302             + "\\u03B0\\u03CA-\\u03CE\\u03D3\\u03D4\\u0400\\u0401\\u0403\\u0407"\r
2303             + "\\u040C-\\u040E\\u0419\\u0439\\u0450\\u0451\\u0453\\u0457\\u045C"\r
2304             + "-\\u045E\\u0476\\u0477\\u0483-\\u0487\\u04C1\\u04C2\\u04D0-"\r
2305             + "\\u04D3\\u04D6\\u04D7\\u04DA-\\u04DF\\u04E2-\\u04E7\\u04EA-"\r
2306             + "\\u04F5\\u04F8\\u04F9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4"\r
2307             + "\\u05C5\\u05C7\\u0610-\\u061A\\u0622-\\u0626\\u064B-\\u065E"\r
2308             + "\\u0670\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4"\r
2309             + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"\r
2310             + "\\u07F3\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958"\r
2311             + "-\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33"\r
2312             + "\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C"\r
2313             + "\\u0B48\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD"\r
2314             + "\\u0C48\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA"\r
2315             + "\\u0CCB\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE"\r
2316             + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"\r
2317             + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"\r
2318             + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"\r
2319             + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"\r
2320             + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u135F\\u1714\\u1734"\r
2321             + "\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B06\\u1B08"\r
2322             + "\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34\\u1B3B\\u1B3D\\u1B40\\u1B41"\r
2323             + "\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1DC0-\\u1DE6"\r
2324             + "\\u1DFE-\\u1E99\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-"\r
2325             + "\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59"\r
2326             + "\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC"\r
2327             + "\\u1FBE\\u1FC1-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-"\r
2328             + "\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFD\\u2000\\u2001\\u20D0-"\r
2329             + "\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A\\u212B\\u219A\\u219B"\r
2330             + "\\u21AE\\u21CD-\\u21CF\\u2204\\u2209\\u220C\\u2224\\u2226\\u2241"\r
2331             + "\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-\\u2271\\u2274\\u2275"\r
2332             + "\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285\\u2288\\u2289\\u22AC-"\r
2333             + "\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED\\u2329\\u232A\\u2ADC"\r
2334             + "\\u2DE0-\\u2DFF\\u302A-\\u302F\\u304C\\u304E\\u3050\\u3052"\r
2335             + "\\u3054\\u3056\\u3058\\u305A\\u305C\\u305E\\u3060\\u3062\\u3065"\r
2336             + "\\u3067\\u3069\\u3070\\u3071\\u3073\\u3074\\u3076\\u3077\\u3079"\r
2337             + "\\u307A\\u307C\\u307D\\u3094\\u3099\\u309A\\u309E\\u30AC\\u30AE"\r
2338             + "\\u30B0\\u30B2\\u30B4\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0"\r
2339             + "\\u30C2\\u30C5\\u30C7\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6"\r
2340             + "\\u30D7\\u30D9\\u30DA\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE"\r
2341             + "\\uA66F\\uA67C\\uA67D\\uA806\\uA8C4\\uA92B-\\uA92D\\uA953\\uAC00"\r
2342             + "-\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"\r
2343             + "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6A\\uFA70-"\r
2344             + "\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E"\r
2345             + "\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFB4E\\uFE20-\\uFE26"\r
2346             + "\\U000101FD\\U00010A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010"\r
2347             + "A3F\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001D172\\U0001D17B-"\r
2348             + "\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001"\r
2349             + "D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002F800-\\U0002FA1D]", false);\r
2350 \r
2351       skipSets[C].applyPattern(\r
2352           "[^<->A-PR-Za-pr-z\\u00A8\\u00C0-\\u00CF\\u00D1-\\u00D6\\u00D8-"\r
2353           + "\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD\\u00FF-"\r
2354           + "\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121\\u0124"\r
2355           + "\\u0125\\u0128-\\u012D\\u0130\\u0139\\u013A\\u013D\\u013E\\u0143"\r
2356           + "\\u0144\\u0147\\u0148\\u014C-\\u0151\\u0154\\u0155\\u0158-"\r
2357           + "\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168-\\u0171\\u0174-"\r
2358           + "\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7\\u01CD-\\u01DC\\u01DE"\r
2359           + "-\\u01E1\\u01E6-\\u01EB\\u01F4\\u01F5\\u01F8-\\u01FB\\u0200-"\r
2360           + "\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0292\\u0300-\\u034E"\r
2361           + "\\u0350-\\u036F\\u0374\\u037E\\u0387\\u0391\\u0395\\u0397\\u0399"\r
2362           + "\\u039F\\u03A1\\u03A5\\u03A9\\u03AC\\u03AE\\u03B1\\u03B5\\u03B7"\r
2363           + "\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-\\u03CB\\u03CE\\u03D2\\u0406"\r
2364           + "\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423\\u0427\\u042B"\r
2365           + "\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E\\u0443\\u0447"\r
2366           + "\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487\\u04D8\\u04D9"\r
2367           + "\\u04E8\\u04E9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5"\r
2368           + "\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627\\u0648\\u064A-"\r
2369           + "\\u065E\\u0670\\u06C1\\u06D2\\u06D5-\\u06DC\\u06DF-\\u06E4"\r
2370           + "\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"\r
2371           + "\\u07F3\\u0928\\u0930\\u0933\\u093C\\u094D\\u0951-\\u0954\\u0958"\r
2372           + "-\\u095F\\u09BC\\u09BE\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF"\r
2373           + "\\u0A33\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD"\r
2374           + "\\u0B3C\\u0B3E\\u0B47\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92"\r
2375           + "\\u0BBE\\u0BC6\\u0BC7\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56"\r
2376           + "\\u0CBC\\u0CBF\\u0CC2\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E"\r
2377           + "\\u0D46\\u0D47\\u0D4D\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF"\r
2378           + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"\r
2379           + "\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"\r
2380           + "\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"\r
2381           + "\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"\r
2382           + "\\u0FC6\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u1100-\\u1112"\r
2383           + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"\r
2384           + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B05\\u1B07\\u1B09"\r
2385           + "\\u1B0B\\u1B0D\\u1B11\\u1B34\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F"\r
2386           + "\\u1B42\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1DC0-\\u1DE6"\r
2387           + "\\u1DFE-\\u1E03\\u1E0A-\\u1E0F\\u1E12-\\u1E1B\\u1E20-\\u1E27"\r
2388           + "\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-\\u1E7D\\u1E80-\\u1E87"\r
2389           + "\\u1E8E-\\u1E91\\u1E96-\\u1E99\\u1EA0-\\u1EF3\\u1EF6-\\u1EF9"\r
2390           + "\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-\\u1F31\\u1F38\\u1F39"\r
2391           + "\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51\\u1F59\\u1F60-\\u1F71"\r
2392           + "\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-\\u1F7D\\u1F80\\u1F81"\r
2393           + "\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99\\u1FA0\\u1FA1\\u1FA8"\r
2394           + "\\u1FA9\\u1FB3\\u1FB6\\u1FBB\\u1FBC\\u1FBE\\u1FBF\\u1FC3\\u1FC6"\r
2395           + "\\u1FC9\\u1FCB\\u1FCC\\u1FD3\\u1FDB\\u1FE3\\u1FEB\\u1FEE\\u1FEF"\r
2396           + "\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000\\u2001\\u20D0-"\r
2397           + "\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A\\u212B\\u2190\\u2192"\r
2398           + "\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208\\u220B\\u2223\\u2225"\r
2399           + "\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261\\u2264\\u2265\\u2272"\r
2400           + "\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282\\u2283\\u2286\\u2287"\r
2401           + "\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB\\u22B2-\\u22B5\\u2329"\r
2402           + "\\u232A\\u2ADC\\u2DE0-\\u2DFF\\u302A-\\u302F\\u3046\\u304B"\r
2403           + "\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059\\u305B\\u305D"\r
2404           + "\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072\\u3075\\u3078"\r
2405           + "\\u307B\\u3099\\u309A\\u309D\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1"\r
2406           + "\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4"\r
2407           + "\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2"\r
2408           + "\\u30FD\\uA66F\\uA67C\\uA67D\\uA806\\uA8C4\\uA92B-\\uA92D\\uA953"\r
2409           + "\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8\\uACC4\\uACE0"\r
2410           + "\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4\\uADC0\\uADDC"\r
2411           + "\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0\\uAEBC\\uAED8"\r
2412           + "\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C\\uAFB8\\uAFD4"\r
2413           + "\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C\\uB098\\uB0B4\\uB0D0"\r
2414           + "\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178\\uB194\\uB1B0\\uB1CC"\r
2415           + "\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274\\uB290\\uB2AC\\uB2C8"\r
2416           + "\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370\\uB38C\\uB3A8\\uB3C4"\r
2417           + "\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C\\uB488\\uB4A4\\uB4C0"\r
2418           + "\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568\\uB584\\uB5A0\\uB5BC"\r
2419           + "\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664\\uB680\\uB69C\\uB6B8"\r
2420           + "\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760\\uB77C\\uB798\\uB7B4"\r
2421           + "\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C\\uB878\\uB894\\uB8B0"\r
2422           + "\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958\\uB974\\uB990\\uB9AC"\r
2423           + "\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70\\uBA8C\\uBAA8"\r
2424           + "\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C\\uBB88\\uBBA4"\r
2425           + "\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68\\uBC84\\uBCA0"\r
2426           + "\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64\\uBD80\\uBD9C"\r
2427           + "\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60\\uBE7C\\uBE98"\r
2428           + "\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C\\uBF78\\uBF94"\r
2429           + "\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C\\uC058\\uC074\\uC090"\r
2430           + "\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138\\uC154\\uC170\\uC18C"\r
2431           + "\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234\\uC250\\uC26C\\uC288"\r
2432           + "\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C\\uC368\\uC384"\r
2433           + "\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448\\uC464\\uC480"\r
2434           + "\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544\\uC560\\uC57C"\r
2435           + "\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624\\uC640\\uC65C\\uC678"\r
2436           + "\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C\\uC758\\uC774"\r
2437           + "\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838\\uC854\\uC870"\r
2438           + "\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934\\uC950\\uC96C"\r
2439           + "\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30\\uCA4C\\uCA68"\r
2440           + "\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C\\uCB48\\uCB64"\r
2441           + "\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28\\uCC44\\uCC60"\r
2442           + "\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24\\uCD40\\uCD5C"\r
2443           + "\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20\\uCE3C\\uCE58"\r
2444           + "\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C\\uCF38\\uCF54"\r
2445           + "\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018\\uD034\\uD050"\r
2446           + "\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114\\uD130\\uD14C"\r
2447           + "\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210\\uD22C\\uD248"\r
2448           + "\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C\\uD328\\uD344"\r
2449           + "\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408\\uD424\\uD440"\r
2450           + "\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504\\uD520\\uD53C"\r
2451           + "\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600\\uD61C\\uD638"\r
2452           + "\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC\\uD718\\uD734"\r
2453           + "\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-"\r
2454           + "\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-"\r
2455           + "\\uFA6A\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36\\uFB38-"\r
2456           + "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFB4E\\uFE20"\r
2457           + "-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010A38-\\U00010A3A"\r
2458           + "\\U00010A3F\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001D172\\U0001"\r
2459           + "D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-\\U0001D1AD"\r
2460           + "\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002F800-"\r
2461           + "\\U0002FA1D]", false);\r
2462    \r
2463         skipSets[KD].applyPattern(\r
2464               "[^\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5\\u00B8-\\u00BA"\r
2465               + "\\u00BC-\\u00BE\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6"\r
2466               + "\\u00D9-\\u00DD\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6"\r
2467               + "\\u00F9-\\u00FD\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130"\r
2468               + "\\u0132-\\u0137\\u0139-\\u0140\\u0143-\\u0149\\u014C-\\u0151"\r
2469               + "\\u0154-\\u0165\\u0168-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0"\r
2470               + "\\u01C4-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B"\r
2471               + "\\u021E\\u021F\\u0226-\\u0233\\u02B0-\\u02B8\\u02D8-\\u02DD"\r
2472               + "\\u02E0-\\u02E4\\u0300-\\u034E\\u0350-\\u036F\\u0374\\u037A"\r
2473               + "\\u037E\\u0384-\\u038A\\u038C\\u038E-\\u0390\\u03AA-\\u03B0"\r
2474               + "\\u03CA-\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"\r
2475               + "\\u03F9\\u0400\\u0401\\u0403\\u0407\\u040C-\\u040E\\u0419\\u0439"\r
2476               + "\\u0450\\u0451\\u0453\\u0457\\u045C-\\u045E\\u0476\\u0477\\u0483"\r
2477               + "-\\u0487\\u04C1\\u04C2\\u04D0-\\u04D3\\u04D6\\u04D7\\u04DA-"\r
2478               + "\\u04DF\\u04E2-\\u04E7\\u04EA-\\u04F5\\u04F8\\u04F9\\u0587"\r
2479               + "\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610"\r
2480               + "-\\u061A\\u0622-\\u0626\\u064B-\\u065E\\u0670\\u0675-\\u0678"\r
2481               + "\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7"\r
2482               + "\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-\\u07F3"\r
2483               + "\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958-"\r
2484               + "\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36"\r
2485               + "\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B48"\r
2486               + "\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD\\u0C48"\r
2487               + "\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA\\u0CCB"\r
2488               + "\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE\\u0E33"\r
2489               + "\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-"\r
2490               + "\\u0ECB\\u0EDC\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39"\r
2491               + "\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80"\r
2492               + "-\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"\r
2493               + "\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u10FC\\u135F\\u1714"\r
2494               + "\\u1734\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B06"\r
2495               + "\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34\\u1B3B\\u1B3D\\u1B40"\r
2496               + "\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1D2C-"\r
2497               + "\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-\\u1D6A\\u1D78"\r
2498               + "\\u1D9B-\\u1DE6\\u1DFE-\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15"\r
2499               + "\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57"\r
2500               + "\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-"\r
2501               + "\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-"\r
2502               + "\\u1FF4\\u1FF6-\\u1FFE\\u2000-\\u200A\\u2011\\u2017\\u2024-"\r
2503               + "\\u2026\\u202F\\u2033\\u2034\\u2036\\u2037\\u203C\\u203E\\u2047-"\r
2504               + "\\u2049\\u2057\\u205F\\u2070\\u2071\\u2074-\\u208E\\u2090-"\r
2505               + "\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2100-"\r
2506               + "\\u2103\\u2105-\\u2107\\u2109-\\u2113\\u2115\\u2116\\u2119-"\r
2507               + "\\u211D\\u2120-\\u2122\\u2124\\u2126\\u2128\\u212A-\\u212D"\r
2508               + "\\u212F-\\u2131\\u2133-\\u2139\\u213B-\\u2140\\u2145-\\u2149"\r
2509               + "\\u2153-\\u217F\\u219A\\u219B\\u21AE\\u21CD-\\u21CF\\u2204"\r
2510               + "\\u2209\\u220C\\u2224\\u2226\\u222C\\u222D\\u222F\\u2230\\u2241"\r
2511               + "\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-\\u2271\\u2274\\u2275"\r
2512               + "\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285\\u2288\\u2289\\u22AC-"\r
2513               + "\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED\\u2329\\u232A\\u2460-"\r
2514               + "\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2D6F\\u2DE0"\r
2515               + "-\\u2DFF\\u2E9F\\u2EF3\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F"\r
2516               + "\\u3036\\u3038-\\u303A\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056"\r
2517               + "\\u3058\\u305A\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069"\r
2518               + "\\u3070\\u3071\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C"\r
2519               + "\\u307D\\u3094\\u3099-\\u309C\\u309E\\u309F\\u30AC\\u30AE\\u30B0"\r
2520               + "\\u30B2\\u30B4\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2"\r
2521               + "\\u30C5\\u30C7\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7"\r
2522               + "\\u30D9\\u30DA\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\u30FF"\r
2523               + "\\u3131-\\u318E\\u3192-\\u319F\\u3200-\\u321E\\u3220-\\u3243"\r
2524               + "\\u3250-\\u327E\\u3280-\\u32FE\\u3300-\\u33FF\\uA66F\\uA67C"\r
2525               + "\\uA67D\\uA770\\uA806\\uA8C4\\uA92B-\\uA92D\\uA953\\uAC00-"\r
2526               + "\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"\r
2527               + "\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6A\\uFA70-"\r
2528               + "\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB36\\uFB38-"\r
2529               + "\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3"\r
2530               + "-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-"\r
2531               + "\\uFE19\\uFE20-\\uFE26\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-"\r
2532               + "\\uFE66\\uFE68-\\uFE6B\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC"\r
2533               + "\\uFF01-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7"\r
2534               + "\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010"\r
2535               + "A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010A3F\\U0001D15E-"\r
2536               + "\\U0001D169\\U0001D16D-\\U0001D172\\U0001D17B-\\U0001D182\\U0001"\r
2537               + "D185-\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001D1BB-\\U0001D1C0"\r
2538               + "\\U0001D242-\\U0001D244\\U0001D400-\\U0001D454\\U0001D456-"\r
2539               + "\\U0001D49C\\U0001D49E\\U0001D49F\\U0001D4A2\\U0001D4A5\\U0001D4"\r
2540               + "A6\\U0001D4A9-\\U0001D4AC\\U0001D4AE-\\U0001D4B9\\U0001D4BB"\r
2541               + "\\U0001D4BD-\\U0001D4C3\\U0001D4C5-\\U0001D505\\U0001D507-"\r
2542               + "\\U0001D50A\\U0001D50D-\\U0001D514\\U0001D516-\\U0001D51C\\U0001"\r
2543               + "D51E-\\U0001D539\\U0001D53B-\\U0001D53E\\U0001D540-\\U0001D544"\r
2544               + "\\U0001D546\\U0001D54A-\\U0001D550\\U0001D552-\\U0001D6A5\\U0001"\r
2545               + "D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF\\U0002F800-\\U0002FA1D]", false);\r
2546    \r
2547         skipSets[KC].applyPattern(\r
2548                 "[^<->A-PR-Za-pr-z\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5"\r
2549                 + "\\u00B8-\\u00BA\\u00BC-\\u00BE\\u00C0-\\u00CF\\u00D1-\\u00D6"\r
2550                 + "\\u00D8-\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD"\r
2551                 + "\\u00FF-\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121"\r
2552                 + "\\u0124\\u0125\\u0128-\\u012D\\u0130\\u0132\\u0133\\u0139\\u013A"\r
2553                 + "\\u013D-\\u0140\\u0143\\u0144\\u0147-\\u0149\\u014C-\\u0151"\r
2554                 + "\\u0154\\u0155\\u0158-\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168"\r
2555                 + "-\\u0171\\u0174-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7"\r
2556                 + "\\u01C4-\\u01DC\\u01DE-\\u01E1\\u01E6-\\u01EB\\u01F1-\\u01F5"\r
2557                 + "\\u01F8-\\u01FB\\u0200-\\u021B\\u021E\\u021F\\u0226-\\u0233"\r
2558                 + "\\u0292\\u02B0-\\u02B8\\u02D8-\\u02DD\\u02E0-\\u02E4\\u0300-"\r
2559                 + "\\u034E\\u0350-\\u036F\\u0374\\u037A\\u037E\\u0384\\u0385\\u0387"\r
2560                 + "\\u0391\\u0395\\u0397\\u0399\\u039F\\u03A1\\u03A5\\u03A9\\u03AC"\r
2561                 + "\\u03AE\\u03B1\\u03B5\\u03B7\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-"\r
2562                 + "\\u03CB\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"\r
2563                 + "\\u03F9\\u0406\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423"\r
2564                 + "\\u0427\\u042B\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E"\r
2565                 + "\\u0443\\u0447\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487"\r
2566                 + "\\u04D8\\u04D9\\u04E8\\u04E9\\u0587\\u0591-\\u05BD\\u05BF\\u05C1"\r
2567                 + "\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627"\r
2568                 + "\\u0648\\u064A-\\u065E\\u0670\\u0675-\\u0678\\u06C1\\u06D2"\r
2569                 + "\\u06D5-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED"\r
2570                 + "\\u0711\\u0730-\\u074A\\u07EB-\\u07F3\\u0928\\u0930\\u0933"\r
2571                 + "\\u093C\\u094D\\u0951-\\u0954\\u0958-\\u095F\\u09BC\\u09BE"\r
2572                 + "\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36\\u0A3C"\r
2573                 + "\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B3E\\u0B47"\r
2574                 + "\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92\\u0BBE\\u0BC6\\u0BC7"\r
2575                 + "\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CBF\\u0CC2"\r
2576                 + "\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E\\u0D46\\u0D47\\u0D4D"\r
2577                 + "\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF\\u0E33\\u0E38-\\u0E3A"\r
2578                 + "\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-\\u0ECB\\u0EDC"\r
2579                 + "\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D"\r
2580                 + "\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80-\\u0F84"\r
2581                 + "\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9\\u0FC6"\r
2582                 + "\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u10FC\\u1100-\\u1112"\r
2583                 + "\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"\r
2584                 + "\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1B05\\u1B07\\u1B09"\r
2585                 + "\\u1B0B\\u1B0D\\u1B11\\u1B34\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F"\r
2586                 + "\\u1B42\\u1B44\\u1B6B-\\u1B73\\u1BAA\\u1C37\\u1D2C-\\u1D2E"\r
2587                 + "\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-\\u1D6A\\u1D78\\u1D9B-"\r
2588                 + "\\u1DE6\\u1DFE-\\u1E03\\u1E0A-\\u1E0F\\u1E12-\\u1E1B\\u1E20-"\r
2589                 + "\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-\\u1E7D\\u1E80-"\r
2590                 + "\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E9B\\u1EA0-\\u1EF3\\u1EF6-"\r
2591                 + "\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-\\u1F31\\u1F38"\r
2592                 + "\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51\\u1F59\\u1F60-"\r
2593                 + "\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-\\u1F7D\\u1F80"\r
2594                 + "\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99\\u1FA0\\u1FA1"\r
2595                 + "\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB-\\u1FC1\\u1FC3\\u1FC6\\u1FC9"\r
2596                 + "\\u1FCB-\\u1FCF\\u1FD3\\u1FDB\\u1FDD-\\u1FDF\\u1FE3\\u1FEB"\r
2597                 + "\\u1FED-\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000-"\r
2598                 + "\\u200A\\u2011\\u2017\\u2024-\\u2026\\u202F\\u2033\\u2034\\u2036"\r
2599                 + "\\u2037\\u203C\\u203E\\u2047-\\u2049\\u2057\\u205F\\u2070\\u2071"\r
2600                 + "\\u2074-\\u208E\\u2090-\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1"\r
2601                 + "\\u20E5-\\u20F0\\u2100-\\u2103\\u2105-\\u2107\\u2109-\\u2113"\r
2602                 + "\\u2115\\u2116\\u2119-\\u211D\\u2120-\\u2122\\u2124\\u2126"\r
2603                 + "\\u2128\\u212A-\\u212D\\u212F-\\u2131\\u2133-\\u2139\\u213B-"\r
2604                 + "\\u2140\\u2145-\\u2149\\u2153-\\u217F\\u2190\\u2192\\u2194"\r
2605                 + "\\u21D0\\u21D2\\u21D4\\u2203\\u2208\\u220B\\u2223\\u2225\\u222C"\r
2606                 + "\\u222D\\u222F\\u2230\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261"\r
2607                 + "\\u2264\\u2265\\u2272\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282"\r
2608                 + "\\u2283\\u2286\\u2287\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB"\r
2609                 + "\\u22B2-\\u22B5\\u2329\\u232A\\u2460-\\u24EA\\u2A0C\\u2A74-"\r
2610                 + "\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2D6F\\u2DE0-\\u2DFF\\u2E9F\\u2EF3"\r
2611                 + "\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F\\u3036\\u3038-\\u303A"\r
2612                 + "\\u3046\\u304B\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059"\r
2613                 + "\\u305B\\u305D\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072"\r
2614                 + "\\u3075\\u3078\\u307B\\u3099-\\u309D\\u309F\\u30A6\\u30AB\\u30AD"\r
2615                 + "\\u30AF\\u30B1\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF"\r
2616                 + "\\u30C1\\u30C4\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB"\r
2617                 + "\\u30EF-\\u30F2\\u30FD\\u30FF\\u3131-\\u318E\\u3192-\\u319F"\r
2618                 + "\\u3200-\\u321E\\u3220-\\u3243\\u3250-\\u327E\\u3280-\\u32FE"\r
2619                 + "\\u3300-\\u33FF\\uA66F\\uA67C\\uA67D\\uA770\\uA806\\uA8C4\\uA92B"\r
2620                 + "-\\uA92D\\uA953\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8"\r
2621                 + "\\uACC4\\uACE0\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4"\r
2622                 + "\\uADC0\\uADDC\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0"\r
2623                 + "\\uAEBC\\uAED8\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C"\r
2624                 + "\\uAFB8\\uAFD4\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C\\uB098"\r
2625                 + "\\uB0B4\\uB0D0\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178\\uB194"\r
2626                 + "\\uB1B0\\uB1CC\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274\\uB290"\r
2627                 + "\\uB2AC\\uB2C8\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370\\uB38C"\r
2628                 + "\\uB3A8\\uB3C4\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C\\uB488"\r
2629                 + "\\uB4A4\\uB4C0\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568\\uB584"\r
2630                 + "\\uB5A0\\uB5BC\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664\\uB680"\r
2631                 + "\\uB69C\\uB6B8\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760\\uB77C"\r
2632                 + "\\uB798\\uB7B4\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C\\uB878"\r
2633                 + "\\uB894\\uB8B0\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958\\uB974"\r
2634                 + "\\uB990\\uB9AC\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70"\r
2635                 + "\\uBA8C\\uBAA8\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C"\r
2636                 + "\\uBB88\\uBBA4\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68"\r
2637                 + "\\uBC84\\uBCA0\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64"\r
2638                 + "\\uBD80\\uBD9C\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60"\r
2639                 + "\\uBE7C\\uBE98\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C"\r
2640                 + "\\uBF78\\uBF94\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C\\uC058"\r
2641                 + "\\uC074\\uC090\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138\\uC154"\r
2642                 + "\\uC170\\uC18C\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234\\uC250"\r
2643                 + "\\uC26C\\uC288\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C"\r
2644                 + "\\uC368\\uC384\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448"\r
2645                 + "\\uC464\\uC480\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544"\r
2646                 + "\\uC560\\uC57C\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624\\uC640"\r
2647                 + "\\uC65C\\uC678\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C"\r
2648                 + "\\uC758\\uC774\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838"\r
2649                 + "\\uC854\\uC870\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934"\r
2650                 + "\\uC950\\uC96C\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30"\r
2651                 + "\\uCA4C\\uCA68\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C"\r
2652                 + "\\uCB48\\uCB64\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28"\r
2653                 + "\\uCC44\\uCC60\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24"\r
2654                 + "\\uCD40\\uCD5C\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20"\r
2655                 + "\\uCE3C\\uCE58\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C"\r
2656                 + "\\uCF38\\uCF54\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018"\r
2657                 + "\\uD034\\uD050\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114"\r
2658                 + "\\uD130\\uD14C\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210"\r
2659                 + "\\uD22C\\uD248\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C"\r
2660                 + "\\uD328\\uD344\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408"\r
2661                 + "\\uD424\\uD440\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504"\r
2662                 + "\\uD520\\uD53C\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600"\r
2663                 + "\\uD61C\\uD638\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC"\r
2664                 + "\\uD718\\uD734\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12"\r
2665                 + "\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D"\r
2666                 + "\\uFA30-\\uFA6A\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17"\r
2667                 + "\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43"\r
2668                 + "\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F\\uFD92-"\r
2669                 + "\\uFDC7\\uFDF0-\\uFDFC\\uFE10-\\uFE19\\uFE20-\\uFE26\\uFE30-"\r
2670                 + "\\uFE44\\uFE47-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B\\uFE70-"\r
2671                 + "\\uFE72\\uFE74\\uFE76-\\uFEFC\\uFF01-\\uFFBE\\uFFC2-\\uFFC7"\r
2672                 + "\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6"\r
2673                 + "\\uFFE8-\\uFFEE\\U000101FD\\U00010A0D\\U00010A0F\\U00010A38-"\r
2674                 + "\\U00010A3A\\U00010A3F\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"\r
2675                 + "D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"\r
2676                 + "\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0001"\r
2677                 + "D400-\\U0001D454\\U0001D456-\\U0001D49C\\U0001D49E\\U0001D49F"\r
2678                 + "\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4A9-\\U0001D4AC\\U0001D"\r
2679                 + "4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-\\U0001D4C3\\U0001D4C5-"\r
2680                 + "\\U0001D505\\U0001D507-\\U0001D50A\\U0001D50D-\\U0001D514\\U0001"\r
2681                 + "D516-\\U0001D51C\\U0001D51E-\\U0001D539\\U0001D53B-\\U0001D53E"\r
2682                 + "\\U0001D540-\\U0001D544\\U0001D546\\U0001D54A-\\U0001D550\\U0001"\r
2683                 + "D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF"\r
2684                 + "\\U0002F800-\\U0002FA1D]", false);\r
2685    \r
2686         return skipSets;\r
2687     }\r
2688 \r
2689     public void TestSkippable() {\r
2690        UnicodeSet starts;\r
2691        UnicodeSet[] skipSets = new UnicodeSet[]{\r
2692                                                     new UnicodeSet(), //NFD\r
2693                                                     new UnicodeSet(), //NFC\r
2694                                                     new UnicodeSet(), //NFKC\r
2695                                                     new UnicodeSet(), //NFKD\r
2696                                                     new UnicodeSet(), //FCD\r
2697                                                     new UnicodeSet(), //NONE\r
2698                                                };\r
2699        UnicodeSet[] expectSets = new UnicodeSet[]{\r
2700                                                     new UnicodeSet(),\r
2701                                                     new UnicodeSet(),\r
2702                                                     new UnicodeSet(),\r
2703                                                     new UnicodeSet(),\r
2704                                                     new UnicodeSet(),\r
2705                                                     new UnicodeSet(),\r
2706                                                };\r
2707        StringBuffer s, pattern;\r
2708        int start, limit, rangeEnd;\r
2709        int i, range, count;\r
2710        starts = new UnicodeSet();\r
2711        /*\r
2712        //[\u0350-\u0357\u035D-\u035F\u0610-\u0615\u0656-\u0658\u0CBC\u17DD\u1939-\u193B]\r
2713        for(int ch=0;ch<=0x10FFFF;ch++){\r
2714                if(Normalizer.isNFSkippable(ch, Normalizer.NFD)) {\r
2715                    skipSets[D].add(ch);\r
2716                }\r
2717                if(Normalizer.isNFSkippable(ch, Normalizer.NFKD)) {\r
2718                    skipSets[KD].add(ch);\r
2719                }\r
2720                if(Normalizer.isNFSkippable(ch, Normalizer.NFC)) {\r
2721                    skipSets[C].add(ch);\r
2722                }\r
2723                if(Normalizer.isNFSkippable(ch, Normalizer.NFKC)) {\r
2724                    skipSets[KC].add(ch);\r
2725                }\r
2726                if(Normalizer.isNFSkippable(ch, Normalizer.FCD)) {\r
2727                    skipSets[FCD].add(ch);\r
2728                }\r
2729                if(Normalizer.isNFSkippable(ch, Normalizer.NONE)) {\r
2730                    skipSets[NONE].add(ch);\r
2731                }\r
2732        }\r
2733        */\r
2734        // build NF*Skippable sets from runtime data \r
2735        NormalizerImpl.addPropertyStarts(starts);\r
2736        count=starts.getRangeCount();\r
2737    \r
2738        start=limit=0;\r
2739        rangeEnd=0;\r
2740        range=0;\r
2741        for(;;) {\r
2742            if(start<limit) {\r
2743                // get properties for start and apply them to [start..limit[ \r
2744                if(Normalizer.isNFSkippable(start, Normalizer.NFD)) {\r
2745                    skipSets[D].add(start, limit-1);\r
2746                }\r
2747                if(Normalizer.isNFSkippable(start, Normalizer.NFKD)) {\r
2748                    skipSets[KD].add(start, limit-1);\r
2749                }\r
2750                if(Normalizer.isNFSkippable(start, Normalizer.NFC)) {\r
2751                    skipSets[C].add(start, limit-1);\r
2752                }\r
2753                if(Normalizer.isNFSkippable(start, Normalizer.NFKC)) {\r
2754                    skipSets[KC].add(start, limit-1);\r
2755                }\r
2756                if(Normalizer.isNFSkippable(start, Normalizer.FCD)) {\r
2757                    skipSets[FCD].add(start, limit-1);\r
2758                }\r
2759                if(Normalizer.isNFSkippable(start, Normalizer.NONE)) {\r
2760                    skipSets[NONE].add(start, limit-1);\r
2761                }\r
2762                \r
2763            }\r
2764    \r
2765            // go to next range of same properties \r
2766            start=limit;\r
2767            if(++limit>rangeEnd) {\r
2768                if(range<count) {\r
2769                    limit=starts.getRangeStart(range);\r
2770                    rangeEnd=starts.getRangeEnd(range);\r
2771                    ++range;\r
2772                } else if(range==count) {\r
2773                    // additional range to complete the Unicode code space \r
2774                    limit=rangeEnd=0x110000;\r
2775                    ++range;\r
2776                } else {\r
2777                    break;\r
2778                }\r
2779            }\r
2780        }\r
2781    \r
2782        expectSets = initSkippables(expectSets);\r
2783        if(expectSets[D].contains(0x0350)){\r
2784             errln("expectSets[D] contains 0x0350");\r
2785        }\r
2786        //expectSets.length for now do not test FCD and NONE since there is no data\r
2787        for(i=0; i< 4; ++i) {\r
2788 \r
2789            if(!skipSets[i].equals(expectSets[i])) {\r
2790                errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"+\r
2791                      "May need to update hardcoded UnicodeSet patterns in com.ibm.icu.dev.test.normalizer.BasicTest.java\n"+\r
2792                      "See ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n" +\r
2793                      "Run com.ibm.text.UCD.Main with the option NFSkippable.");\r
2794    \r
2795                s=new StringBuffer();\r
2796                \r
2797                s.append("\n\nskip=       ");\r
2798                s.append(skipSets[i].toPattern(true));\r
2799                s.append("\n\n");\r
2800                \r
2801                s.append("skip-expect=");             \r
2802                pattern = new StringBuffer(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));\r
2803                s.append(pattern);\r
2804    \r
2805                pattern.delete(0,pattern.length());\r
2806                s.append("\n\nexpect-skip=");\r
2807                pattern = new StringBuffer(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));\r
2808                s.append(pattern);\r
2809                s.append("\n\n");\r
2810                \r
2811                pattern.delete(0,pattern.length());\r
2812                s.append("\n\nintersection(expect,skip)=");\r
2813                UnicodeSet intersection  = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);\r
2814                pattern = new StringBuffer(intersection.toPattern(true));\r
2815                s.append(pattern);\r
2816                s.append("\n\n");\r
2817                \r
2818 \r
2819                \r
2820                errln(s.toString());\r
2821            }\r
2822        }\r
2823      }\r
2824      \r
2825      public void TestBugJ2068(){\r
2826         String sample = "The quick brown fox jumped over the lazy dog";\r
2827         UCharacterIterator text = UCharacterIterator.getInstance(sample);\r
2828         Normalizer norm = new Normalizer(text,Normalizer.NFC,0);\r
2829         text.setIndex(4);\r
2830         if(text.current() == norm.current()){\r
2831             errln("Normalizer is not cloning the UCharacterIterator");\r
2832         }\r
2833      }   \r
2834      public void TestGetCombiningClass(){\r
2835         for(int i=0;i<0x10FFFF;i++){\r
2836             int cc = UCharacter.getCombiningClass(i);\r
2837             if(0xD800<= i && i<=0xDFFF && cc >0 ){\r
2838                 cc = UCharacter.getCombiningClass(i);\r
2839                 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8));\r
2840             } \r
2841         }\r
2842     }  \r
2843     \r
2844     public void TestGetNX(){\r
2845         UnicodeSet set = NormalizerImpl.getNX(1 /*NormalizerImpl.NX_HANGUL*/);\r
2846         if(!set.contains(0xac01)){\r
2847             errln("getNX did not return correct set for NX_HANGUL");\r
2848         }\r
2849         \r
2850         set = NormalizerImpl.getNX(2/*NormalizerImpl.NX_CJK_COMPAT*/);\r
2851         if(!set.contains('\uFA20')){\r
2852             errln("getNX did not return correct set for NX_CJK_COMPAT");\r
2853         }\r
2854     }\r
2855     public void TestSerializedSet(){\r
2856         USerializedSet sset=new USerializedSet();\r
2857         UnicodeSet set = new UnicodeSet();\r
2858         int start, end;\r
2859     \r
2860         // collect all sets into one for contiguous output\r
2861         int[] startEnd = new int[2];\r
2862 \r
2863         if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {\r
2864             int count=sset.countRanges();\r
2865             for(int j=0; j<count; ++j) {\r
2866                 sset.getRange(j, startEnd);\r
2867                 set.add(startEnd[0], startEnd[1]);\r
2868             }\r
2869         }\r
2870        \r
2871 \r
2872         // test all of these precomposed characters\r
2873         UnicodeSetIterator it = new UnicodeSetIterator(set);\r
2874         while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {\r
2875             start=it.codepoint;\r
2876             end=it.codepointEnd;\r
2877             while(start<=end) {\r
2878                 if(!sset.contains(start)){\r
2879                     errln("USerializedSet.contains failed for "+Utility.hex(start,8));\r
2880                 }\r
2881             }\r
2882         }\r
2883     }\r
2884     \r
2885     public void TestReturnFailure(){\r
2886         char[] term = {'r','\u00e9','s','u','m','\u00e9' };\r
2887         char[] decomposed_term = new char[10 + term.length + 2];\r
2888         int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0);\r
2889         int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); \r
2890         if(rc!=rc1){\r
2891             errln("Normalizer decompose did not return correct length");\r
2892         }\r
2893     }\r
2894 \r
2895     private final static class TestCompositionCase {\r
2896         public Normalizer.Mode mode;\r
2897         public int options;\r
2898         public String input, expect;\r
2899         TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) {\r
2900             this.mode=mode;\r
2901             this.options=options;\r
2902             this.input=input;\r
2903             this.expect=expect;\r
2904         }\r
2905     }\r
2906 \r
2907     public void TestComposition() {\r
2908         final TestCompositionCase cases[]=new TestCompositionCase[]{\r
2909             /*\r
2910              * special cases for UAX #15 bug\r
2911              * see Unicode Public Review Issue #29\r
2912              * at http://www.unicode.org/review/resolved-pri.html#pri29\r
2913              */\r
2914             new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327",      "\u1100\u0300\u1161\u0327"),\r
2915             new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),\r
2916             new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8",      "\uac00\u0327\u0300\u11a8"),\r
2917             new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e",            "\u0b47\u0300\u0b3e"),\r
2918 \r
2919             new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327",       "\uac00\u0300\u0327"),\r
2920             new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327\u11a8", "\uac01\u0300\u0327"),\r
2921             new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\uac00\u0300\u0327\u11a8",       "\uac01\u0327\u0300"),\r
2922             new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u0b47\u0300\u0b3e",             "\u0b4b\u0300")\r
2923 \r
2924             /* TODO: add test cases for UNORM_FCC here (j2151) */\r
2925         };\r
2926 \r
2927         String output;\r
2928         int i;\r
2929 \r
2930         for(i=0; i<cases.length; ++i) {\r
2931             output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options);\r
2932             if(!output.equals(cases[i].expect)) {\r
2933                 errln("unexpected result for case "+i);\r
2934             }\r
2935         }\r
2936     }\r
2937 }\r