2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 // copied from the Transliterator demo
\r
10 package com.ibm.icu.dev.test.util;
\r
11 import java.util.HashMap;
\r
12 import java.util.HashSet;
\r
13 import java.util.Iterator;
\r
14 import java.util.Map;
\r
15 import java.util.Set;
\r
16 import java.util.TreeSet;
\r
18 import com.ibm.icu.lang.UCharacter;
\r
19 import com.ibm.icu.text.Transliterator;
\r
20 import com.ibm.icu.text.UTF16;
\r
21 import com.ibm.icu.text.UnicodeSet;
\r
24 * Incrementally returns the set of all strings that case-fold to the same value.
\r
26 public class CaseIterator {
\r
29 private static Transliterator toName = Transliterator.getInstance("[:^ascii:] Any-Name");
\r
30 private static Transliterator toHex = Transliterator.getInstance("[:^ascii:] Any-Hex");
\r
31 private static Transliterator toHex2 = Transliterator.getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");
\r
33 // global tables (could be precompiled)
\r
34 private static Map fromCaseFold = new HashMap();
\r
35 private static Map toCaseFold = new HashMap();
\r
36 private static int maxLength = 0;
\r
38 // This exception list is generated on the console by turning on the GENERATE flag,
\r
39 // which MUST be false for normal operation.
\r
40 // Once the list is generated, it is pasted in here.
\r
41 // A bit of a kludge, but this bootstrapping is the easiest way
\r
42 // to get around certain complications in the data.
\r
44 private static final boolean GENERATE = false;
\r
46 private static final boolean DUMP = false;
\r
48 private static String[][] exceptionList = {
\r
49 // a\N{MODIFIER LETTER RIGHT HALF RING}
\r
50 {"a\u02BE","A\u02BE","a\u02BE",},
\r
52 {"ff","FF","Ff","fF","ff",},
\r
54 {"ffi","FFI","FFi","FfI","Ffi","F\uFB01","fFI","fFi","ffI","ffi","f\uFB01","\uFB00I","\uFB00i",},
\r
56 {"ffl","FFL","FFl","FfL","Ffl","F\uFB02","fFL","fFl","ffL","ffl","f\uFB02","\uFB00L","\uFB00l",},
\r
58 {"fi","FI","Fi","fI","fi",},
\r
60 {"fl","FL","Fl","fL","fl",},
\r
61 // h\N{COMBINING MACRON BELOW}
\r
62 {"h\u0331","H\u0331","h\u0331",},
\r
63 // i\N{COMBINING DOT ABOVE}
\r
64 {"i\u0307","I\u0307","i\u0307",},
\r
65 // j\N{COMBINING CARON}
\r
66 {"j\u030C","J\u030C","j\u030C",},
\r
68 {"ss","SS","Ss","S\u017F","sS","ss","s\u017F","\u017FS","\u017Fs","\u017F\u017F",},
\r
70 {"st","ST","St","sT","st","\u017FT","\u017Ft",},
\r
71 // t\N{COMBINING DIAERESIS}
\r
72 {"t\u0308","T\u0308","t\u0308",},
\r
73 // w\N{COMBINING RING ABOVE}
\r
74 {"w\u030A","W\u030A","w\u030A",},
\r
75 // y\N{COMBINING RING ABOVE}
\r
76 {"y\u030A","Y\u030A","y\u030A",},
\r
77 // \N{MODIFIER LETTER APOSTROPHE}n
\r
78 {"\u02BCn","\u02BCN","\u02BCn",},
\r
79 // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
\r
80 {"\u03AC\u03B9","\u0386\u0345","\u0386\u0399","\u0386\u03B9","\u0386\u1FBE","\u03AC\u0345","\u03AC\u0399","\u03AC\u03B9","\u03AC\u1FBE",},
\r
81 // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
\r
82 {"\u03AE\u03B9","\u0389\u0345","\u0389\u0399","\u0389\u03B9","\u0389\u1FBE","\u03AE\u0345","\u03AE\u0399","\u03AE\u03B9","\u03AE\u1FBE",},
\r
83 // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
\r
84 {"\u03B1\u0342","\u0391\u0342","\u03B1\u0342",},
\r
85 // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
86 {"\u03B1\u0342\u03B9","\u0391\u0342\u0345","\u0391\u0342\u0399","\u0391\u0342\u03B9","\u0391\u0342\u1FBE",
\r
87 "\u03B1\u0342\u0345","\u03B1\u0342\u0399","\u03B1\u0342\u03B9","\u03B1\u0342\u1FBE","\u1FB6\u0345",
\r
88 "\u1FB6\u0399","\u1FB6\u03B9","\u1FB6\u1FBE",},
\r
89 // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
\r
90 {"\u03B1\u03B9","\u0391\u0345","\u0391\u0399","\u0391\u03B9","\u0391\u1FBE","\u03B1\u0345","\u03B1\u0399","\u03B1\u03B9","\u03B1\u1FBE",},
\r
91 // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
\r
92 {"\u03B7\u0342","\u0397\u0342","\u03B7\u0342",},
\r
93 // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
94 {"\u03B7\u0342\u03B9","\u0397\u0342\u0345","\u0397\u0342\u0399","\u0397\u0342\u03B9","\u0397\u0342\u1FBE",
\r
95 "\u03B7\u0342\u0345","\u03B7\u0342\u0399","\u03B7\u0342\u03B9","\u03B7\u0342\u1FBE","\u1FC6\u0345","\u1FC6\u0399",
\r
96 "\u1FC6\u03B9","\u1FC6\u1FBE",},
\r
97 // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
\r
98 {"\u03B7\u03B9","\u0397\u0345","\u0397\u0399","\u0397\u03B9","\u0397\u1FBE","\u03B7\u0345","\u03B7\u0399","\u03B7\u03B9","\u03B7\u1FBE",},
\r
99 // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
\r
100 {"\u03B9\u0308\u0300","\u0345\u0308\u0300","\u0399\u0308\u0300","\u03B9\u0308\u0300","\u1FBE\u0308\u0300",},
\r
101 // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
\r
102 {"\u03B9\u0308\u0301","\u0345\u0308\u0301","\u0399\u0308\u0301","\u03B9\u0308\u0301","\u1FBE\u0308\u0301",},
\r
103 // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
\r
104 {"\u03B9\u0308\u0342","\u0345\u0308\u0342","\u0399\u0308\u0342","\u03B9\u0308\u0342","\u1FBE\u0308\u0342",},
\r
105 // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
\r
106 {"\u03B9\u0342","\u0345\u0342","\u0399\u0342","\u03B9\u0342","\u1FBE\u0342",},
\r
107 // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
\r
108 {"\u03C1\u0313","\u03A1\u0313","\u03C1\u0313","\u03F1\u0313",},
\r
109 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
\r
110 {"\u03C5\u0308\u0300","\u03A5\u0308\u0300","\u03C5\u0308\u0300",},
\r
111 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
\r
112 {"\u03C5\u0308\u0301","\u03A5\u0308\u0301","\u03C5\u0308\u0301",},
\r
113 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
\r
114 {"\u03C5\u0308\u0342","\u03A5\u0308\u0342","\u03C5\u0308\u0342",},
\r
115 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
\r
116 {"\u03C5\u0313","\u03A5\u0313","\u03C5\u0313",},
\r
117 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
\r
118 {"\u03C5\u0313\u0300","\u03A5\u0313\u0300","\u03C5\u0313\u0300","\u1F50\u0300",},
\r
119 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
\r
120 {"\u03C5\u0313\u0301","\u03A5\u0313\u0301","\u03C5\u0313\u0301","\u1F50\u0301",},
\r
121 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
\r
122 {"\u03C5\u0313\u0342","\u03A5\u0313\u0342","\u03C5\u0313\u0342","\u1F50\u0342",},
\r
123 // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
\r
124 {"\u03C5\u0342","\u03A5\u0342","\u03C5\u0342",},
\r
125 // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
\r
126 {"\u03C9\u0342","\u03A9\u0342","\u03C9\u0342","\u2126\u0342",},
\r
127 // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
128 {"\u03C9\u0342\u03B9","\u03A9\u0342\u0345","\u03A9\u0342\u0399","\u03A9\u0342\u03B9","\u03A9\u0342\u1FBE","\u03C9\u0342\u0345","\u03C9\u0342\u0399","\u03C9\u0342\u03B9","\u03C9\u0342\u1FBE","\u1FF6\u0345",
\r
129 "\u1FF6\u0399","\u1FF6\u03B9","\u1FF6\u1FBE","\u2126\u0342\u0345","\u2126\u0342\u0399","\u2126\u0342\u03B9","\u2126\u0342\u1FBE",},
\r
130 // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
\r
131 {"\u03C9\u03B9","\u03A9\u0345","\u03A9\u0399","\u03A9\u03B9","\u03A9\u1FBE","\u03C9\u0345","\u03C9\u0399","\u03C9\u03B9","\u03C9\u1FBE","\u2126\u0345","\u2126\u0399","\u2126\u03B9","\u2126\u1FBE",},
\r
132 // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
\r
133 {"\u03CE\u03B9","\u038F\u0345","\u038F\u0399","\u038F\u03B9","\u038F\u1FBE","\u03CE\u0345","\u03CE\u0399","\u03CE\u03B9","\u03CE\u1FBE",},
\r
134 // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
\r
135 {"\u0565\u0582","\u0535\u0552","\u0535\u0582","\u0565\u0552","\u0565\u0582",},
\r
136 // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
\r
137 {"\u0574\u0565","\u0544\u0535","\u0544\u0565","\u0574\u0535","\u0574\u0565",},
\r
138 // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
\r
139 {"\u0574\u056B","\u0544\u053B","\u0544\u056B","\u0574\u053B","\u0574\u056B",},
\r
140 // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
\r
141 {"\u0574\u056D","\u0544\u053D","\u0544\u056D","\u0574\u053D","\u0574\u056D",},
\r
142 // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
\r
143 {"\u0574\u0576","\u0544\u0546","\u0544\u0576","\u0574\u0546","\u0574\u0576",},
\r
144 // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
\r
145 {"\u057E\u0576","\u054E\u0546","\u054E\u0576","\u057E\u0546","\u057E\u0576",},
\r
146 // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
\r
147 {"\u1F00\u03B9","\u1F00\u0345","\u1F00\u0399","\u1F00\u03B9","\u1F00\u1FBE","\u1F08\u0345","\u1F08\u0399","\u1F08\u03B9","\u1F08\u1FBE",},
\r
148 // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
\r
149 {"\u1F01\u03B9","\u1F01\u0345","\u1F01\u0399","\u1F01\u03B9","\u1F01\u1FBE","\u1F09\u0345","\u1F09\u0399","\u1F09\u03B9","\u1F09\u1FBE",},
\r
150 // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
\r
151 {"\u1F02\u03B9","\u1F02\u0345","\u1F02\u0399","\u1F02\u03B9","\u1F02\u1FBE","\u1F0A\u0345","\u1F0A\u0399","\u1F0A\u03B9","\u1F0A\u1FBE",},
\r
152 // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
\r
153 {"\u1F03\u03B9","\u1F03\u0345","\u1F03\u0399","\u1F03\u03B9","\u1F03\u1FBE","\u1F0B\u0345","\u1F0B\u0399","\u1F0B\u03B9","\u1F0B\u1FBE",},
\r
154 // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
\r
155 {"\u1F04\u03B9","\u1F04\u0345","\u1F04\u0399","\u1F04\u03B9","\u1F04\u1FBE","\u1F0C\u0345","\u1F0C\u0399","\u1F0C\u03B9","\u1F0C\u1FBE",},
\r
156 // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
\r
157 {"\u1F05\u03B9","\u1F05\u0345","\u1F05\u0399","\u1F05\u03B9","\u1F05\u1FBE","\u1F0D\u0345","\u1F0D\u0399","\u1F0D\u03B9","\u1F0D\u1FBE",},
\r
158 // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
159 {"\u1F06\u03B9","\u1F06\u0345","\u1F06\u0399","\u1F06\u03B9","\u1F06\u1FBE","\u1F0E\u0345","\u1F0E\u0399","\u1F0E\u03B9","\u1F0E\u1FBE",},
\r
160 // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
161 {"\u1F07\u03B9","\u1F07\u0345","\u1F07\u0399","\u1F07\u03B9","\u1F07\u1FBE","\u1F0F\u0345","\u1F0F\u0399","\u1F0F\u03B9","\u1F0F\u1FBE",},
\r
162 // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
\r
163 {"\u1F20\u03B9","\u1F20\u0345","\u1F20\u0399","\u1F20\u03B9","\u1F20\u1FBE","\u1F28\u0345","\u1F28\u0399","\u1F28\u03B9","\u1F28\u1FBE",},
\r
164 // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
\r
165 {"\u1F21\u03B9","\u1F21\u0345","\u1F21\u0399","\u1F21\u03B9","\u1F21\u1FBE","\u1F29\u0345","\u1F29\u0399","\u1F29\u03B9","\u1F29\u1FBE",},
\r
166 // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
\r
167 {"\u1F22\u03B9","\u1F22\u0345","\u1F22\u0399","\u1F22\u03B9","\u1F22\u1FBE","\u1F2A\u0345","\u1F2A\u0399","\u1F2A\u03B9","\u1F2A\u1FBE",},
\r
168 // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
\r
169 {"\u1F23\u03B9","\u1F23\u0345","\u1F23\u0399","\u1F23\u03B9","\u1F23\u1FBE","\u1F2B\u0345","\u1F2B\u0399","\u1F2B\u03B9","\u1F2B\u1FBE",},
\r
170 // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
\r
171 {"\u1F24\u03B9","\u1F24\u0345","\u1F24\u0399","\u1F24\u03B9","\u1F24\u1FBE","\u1F2C\u0345","\u1F2C\u0399","\u1F2C\u03B9","\u1F2C\u1FBE",},
\r
172 // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
\r
173 {"\u1F25\u03B9","\u1F25\u0345","\u1F25\u0399","\u1F25\u03B9","\u1F25\u1FBE","\u1F2D\u0345","\u1F2D\u0399","\u1F2D\u03B9","\u1F2D\u1FBE",},
\r
174 // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
175 {"\u1F26\u03B9","\u1F26\u0345","\u1F26\u0399","\u1F26\u03B9","\u1F26\u1FBE","\u1F2E\u0345","\u1F2E\u0399","\u1F2E\u03B9","\u1F2E\u1FBE",},
\r
176 // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
177 {"\u1F27\u03B9","\u1F27\u0345","\u1F27\u0399","\u1F27\u03B9","\u1F27\u1FBE","\u1F2F\u0345","\u1F2F\u0399","\u1F2F\u03B9","\u1F2F\u1FBE",},
\r
178 // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
\r
179 {"\u1F60\u03B9","\u1F60\u0345","\u1F60\u0399","\u1F60\u03B9","\u1F60\u1FBE","\u1F68\u0345","\u1F68\u0399","\u1F68\u03B9","\u1F68\u1FBE",},
\r
180 // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
\r
181 {"\u1F61\u03B9","\u1F61\u0345","\u1F61\u0399","\u1F61\u03B9","\u1F61\u1FBE","\u1F69\u0345","\u1F69\u0399","\u1F69\u03B9","\u1F69\u1FBE",},
\r
182 // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
\r
183 {"\u1F62\u03B9","\u1F62\u0345","\u1F62\u0399","\u1F62\u03B9","\u1F62\u1FBE","\u1F6A\u0345","\u1F6A\u0399","\u1F6A\u03B9","\u1F6A\u1FBE",},
\r
184 // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
\r
185 {"\u1F63\u03B9","\u1F63\u0345","\u1F63\u0399","\u1F63\u03B9","\u1F63\u1FBE","\u1F6B\u0345","\u1F6B\u0399","\u1F6B\u03B9","\u1F6B\u1FBE",},
\r
186 // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
\r
187 {"\u1F64\u03B9","\u1F64\u0345","\u1F64\u0399","\u1F64\u03B9","\u1F64\u1FBE","\u1F6C\u0345","\u1F6C\u0399","\u1F6C\u03B9","\u1F6C\u1FBE",},
\r
188 // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
\r
189 {"\u1F65\u03B9","\u1F65\u0345","\u1F65\u0399","\u1F65\u03B9","\u1F65\u1FBE","\u1F6D\u0345","\u1F6D\u0399","\u1F6D\u03B9","\u1F6D\u1FBE",},
\r
190 // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
191 {"\u1F66\u03B9","\u1F66\u0345","\u1F66\u0399","\u1F66\u03B9","\u1F66\u1FBE","\u1F6E\u0345","\u1F6E\u0399","\u1F6E\u03B9","\u1F6E\u1FBE",},
\r
192 // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
\r
193 {"\u1F67\u03B9","\u1F67\u0345","\u1F67\u0399","\u1F67\u03B9","\u1F67\u1FBE","\u1F6F\u0345","\u1F6F\u0399","\u1F6F\u03B9","\u1F6F\u1FBE",},
\r
194 // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
\r
195 {"\u1F70\u03B9","\u1F70\u0345","\u1F70\u0399","\u1F70\u03B9","\u1F70\u1FBE","\u1FBA\u0345","\u1FBA\u0399","\u1FBA\u03B9","\u1FBA\u1FBE",},
\r
196 // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
\r
197 {"\u1F74\u03B9","\u1F74\u0345","\u1F74\u0399","\u1F74\u03B9","\u1F74\u1FBE","\u1FCA\u0345","\u1FCA\u0399","\u1FCA\u03B9","\u1FCA\u1FBE",},
\r
198 // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
\r
199 {"\u1F7C\u03B9","\u1F7C\u0345","\u1F7C\u0399","\u1F7C\u03B9","\u1F7C\u1FBE","\u1FFA\u0345","\u1FFA\u0399","\u1FFA\u03B9","\u1FFA\u1FBE",},
\r
202 // this initializes the data used to generate the case-equivalents
\r
206 // Gather up the exceptions in a form we can use
\r
209 for (int i = 0; i < exceptionList.length; ++i) {
\r
210 String[] exception = exceptionList[i];
\r
211 Set s = new HashSet();
\r
212 // there has to be some method to do the following, but I can't find it in the collections
\r
213 for (int j = 0; j < exception.length; ++j) {
\r
214 s.add(exception[j]);
\r
216 fromCaseFold.put(exception[0], s);
\r
220 // walk through all the characters, and at every case fold result,
\r
221 // put a set of all the characters that map to that result
\r
223 boolean defaultmapping = true; // false for turkish
\r
224 for (int i = 0; i <= 0x10FFFF; ++i) {
\r
225 int cat = UCharacter.getType(i);
\r
226 if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) continue;
\r
228 String cp = UTF16.valueOf(i);
\r
229 String mapped = UCharacter.foldCase(cp, defaultmapping);
\r
230 if (mapped.equals(cp)) continue;
\r
232 if (maxLength < mapped.length()) maxLength = mapped.length();
\r
234 // at this point, have different case folding
\r
236 Set s = (Set) fromCaseFold.get(mapped);
\r
239 s.add(mapped); // add the case fold result itself
\r
240 fromCaseFold.put(mapped, s);
\r
243 toCaseFold.put(cp, mapped);
\r
244 toCaseFold.put(mapped, mapped); // add mapping to self
\r
247 // Emit the final data
\r
250 System.out.println("maxLength = " + maxLength);
\r
252 System.out.println("\nfromCaseFold:");
\r
253 Iterator it = fromCaseFold.keySet().iterator();
\r
254 while (it.hasNext()) {
\r
255 Object key = it.next();
\r
256 System.out.print(" " + toHex2.transliterate((String)key) + ": ");
\r
257 Set s = (Set) fromCaseFold.get(key);
\r
258 Iterator it2 = s.iterator();
\r
259 boolean first = true;
\r
260 while (it2.hasNext()) {
\r
264 System.out.print(", ");
\r
266 System.out.print(toHex2.transliterate((String)it2.next()));
\r
268 System.out.println("");
\r
271 System.out.println("\ntoCaseFold:");
\r
272 it = toCaseFold.keySet().iterator();
\r
273 while (it.hasNext()) {
\r
274 String key = (String) it.next();
\r
275 String value = (String) toCaseFold.get(key);
\r
276 System.out.println(" " + toHex2.transliterate(key) + ": " + toHex2.transliterate(value));
\r
280 // Now convert all those sets into linear arrays
\r
281 // We can't do this in place in Java, so make a temporary target array
\r
283 // Note: This could be transformed into a single array, with offsets into it.
\r
284 // Might be best choice in C.
\r
287 Map fromCaseFold2 = new HashMap();
\r
288 Iterator it = fromCaseFold.keySet().iterator();
\r
289 while (it.hasNext()) {
\r
290 Object key = it.next();
\r
291 Set s = (Set) fromCaseFold.get(key);
\r
292 String[] temp = new String[s.size()];
\r
294 fromCaseFold2.put(key, temp);
\r
296 fromCaseFold = fromCaseFold2;
\r
298 // We have processed everything, so the iterator will now work
\r
299 // The following is normally OFF.
\r
300 // It is here to generate (under the GENERATE flag) the static exception list.
\r
301 // It must be at the very end of initialization, so that the iterator is functional.
\r
302 // (easiest to do it that way)
\r
306 // first get small set of items that have multiple characters
\r
308 Set multichars = new TreeSet();
\r
309 it = fromCaseFold.keySet().iterator();
\r
310 while (it.hasNext()) {
\r
311 String key = (String) it.next();
\r
312 if (UTF16.countCodePoint(key) < 2) continue;
\r
313 multichars.add(key);
\r
316 // now we will go through each of them.
\r
318 CaseIterator ci = new CaseIterator();
\r
319 it = multichars.iterator();
\r
321 while (it.hasNext()) {
\r
322 String key = (String) it.next();
\r
324 // here is a nasty complication. Take 'ffi' ligature. We
\r
325 // can't just close it, since we would miss the combination
\r
326 // that includes the 'fi' => "fi" ligature
\r
327 // so first do a pass through, and add substring combinations
\r
328 // we call this a 'partial closure'
\r
330 Set partialClosure = new TreeSet();
\r
331 partialClosure.add(key);
\r
333 if (UTF16.countCodePoint(key) > 2) {
\r
334 Iterator multiIt2 = multichars.iterator();
\r
335 while (multiIt2.hasNext()) {
\r
336 String otherKey = (String) multiIt2.next();
\r
337 if (otherKey.length() >= key.length()) continue;
\r
340 // The following is not completely general
\r
341 // but works for the actual cased stuff,
\r
342 // and should work for future characters, since we won't have
\r
343 // more ligatures & other oddities.
\r
344 pos = key.indexOf(otherKey, pos+1);
\r
345 if (pos < 0) break;
\r
346 int endPos = pos + otherKey.length();
\r
347 // we know we have a proper substring,
\r
348 // so get the combinations
\r
349 String[] choices = (String[]) fromCaseFold.get(otherKey);
\r
350 for (int ii = 0; ii < choices.length; ++ii) {
\r
351 String patchwork = key.substring(0, pos)
\r
353 + key.substring(endPos);
\r
354 partialClosure.add(patchwork);
\r
360 // now, for each thing in the partial closure, get its
\r
361 // case closure and add it to the final result.
\r
363 Set closure = new TreeSet(); // this will be the real closure
\r
364 Iterator partialIt = partialClosure.iterator();
\r
365 while (partialIt.hasNext()) {
\r
366 String key2 = (String) partialIt.next();
\r
368 for (String temp = ci.next(); temp != null; temp = ci.next()) {
\r
372 /*String[] choices = (String[]) fromCaseFold.get(key2);
\r
373 for (int i = 0; i < choices.length; ++i) {
\r
374 ci.reset(choices[i]);
\r
376 while (null != (temp = ci.next())) {
\r
383 // print it out, so that it can be cut and pasted back into this document.
\r
385 Iterator it2 = closure.iterator();
\r
386 System.out.println("\t// " + toName.transliterate(key));
\r
387 System.out.print("\t{\"" + toHex.transliterate(key) + "\",");
\r
388 while (it2.hasNext()) {
\r
389 String item = (String)it2.next();
\r
390 System.out.print("\"" + toHex.transliterate(item) + "\",");
\r
392 System.out.println("},");
\r
397 // ============ PRIVATE CLASS DATA ============
\r
399 // pieces that we will put together
\r
400 // is not changed during iteration
\r
401 private int count = 0;
\r
402 private String[][] variants;
\r
404 // state information, changes during iteration
\r
405 private boolean done = false;
\r
406 private int[] counts;
\r
408 // internal buffer for efficiency
\r
409 private StringBuffer nextBuffer = new StringBuffer();
\r
411 // ========================
\r
414 * Reset to different source. Once reset, the iteration starts from the beginning.
\r
415 * @param source The string to get case variants for
\r
417 public void reset(String source) {
\r
419 // allocate arrays to store pieces
\r
420 // using length might be slightly too long, but we don't care much
\r
422 counts = new int[source.length()];
\r
423 variants = new String[source.length()][];
\r
425 // walk through the source, and break up into pieces
\r
426 // each piece becomes an array of equivalent values
\r
427 // TODO: could optimized this later to coalesce all single string pieces
\r
429 String piece = null;
\r
431 for (int i = 0; i < source.length(); i += piece.length()) {
\r
433 // find *longest* matching piece
\r
434 String caseFold = null;
\r
437 // do exactly one CP
\r
438 piece = UTF16.valueOf(source, i);
\r
439 caseFold = (String) toCaseFold.get(piece);
\r
441 int max = i + maxLength;
\r
442 if (max > source.length()) max = source.length();
\r
443 for (int j = max; j > i; --j) {
\r
444 piece = source.substring(i, j);
\r
445 caseFold = (String) toCaseFold.get(piece);
\r
446 if (caseFold != null) break;
\r
450 // if we fail, pick one code point
\r
451 if (caseFold == null) {
\r
452 piece = UTF16.valueOf(source, i);
\r
453 variants[count++] = new String[] {piece}; // single item string
\r
455 variants[count++] = (String[])fromCaseFold.get(caseFold);
\r
462 * Restart the iteration from the beginning, but with same source
\r
464 public void reset() {
\r
466 for (int i = 0; i < count; ++i) {
\r
472 * Iterates through the case variants.
\r
473 * @return next case variant. Each variant will case-fold to the same value as the source will.
\r
474 * When the iteration is done, null is returned.
\r
476 public String next() {
\r
478 if (done) return null;
\r
481 // TODO Optimize so we keep the piece before and after the current position
\r
482 // so we don't have so much concatenation
\r
484 // get the result, a concatenation
\r
486 nextBuffer.setLength(0);
\r
487 for (i = 0; i < count; ++i) {
\r
488 nextBuffer.append(variants[i][counts[i]]);
\r
491 // find the next right set of pieces to concatenate
\r
493 for (i = count-1; i >= 0; --i) {
\r
495 if (counts[i] < variants[i].length) break;
\r
499 // if we go too far, bail
\r
505 return nextBuffer.toString();
\r
510 * Temporary test, just to see how the stuff works.
\r
512 static public void main(String[] args) {
\r
513 String[] testCases = {"fiss", "h\u03a3"};
\r
514 CaseIterator ci = new CaseIterator();
\r
516 for (int i = 0; i < testCases.length; ++i) {
\r
517 String item = testCases[i];
\r
518 System.out.println();
\r
519 System.out.println("Testing: " + toName.transliterate(item));
\r
520 System.out.println();
\r
523 for (String temp = ci.next(); temp != null; temp = ci.next()) {
\r
524 System.out.println(toName.transliterate(temp));
\r
527 System.out.println("Total: " + count);
\r
530 // generate a list of all caseless characters -- characters whose
\r
531 // case closure is themselves.
\r
533 UnicodeSet caseless = new UnicodeSet();
\r
535 for (int i = 0; i <= 0x10FFFF; ++i) {
\r
536 String cp = UTF16.valueOf(i);
\r
539 String fold = null;
\r
540 for (String temp = ci.next(); temp != null; temp = ci.next()) {
\r
542 if (++count > 1) break;
\r
544 if (count==1 && fold.equals(cp)) {
\r
549 System.out.println("caseless = " + caseless.toPattern(true));
\r
551 UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
\r
553 UnicodeSet a = new UnicodeSet();
\r
555 a.removeAll(caseless);
\r
556 System.out.println("[:^lc:] - caseless = " + a.toPattern(true));
\r
559 a.removeAll(not_lc);
\r
560 System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
\r