]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/tests/translit/src/com/ibm/icu/dev/util/CaseIterator.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / main / tests / translit / src / com / ibm / icu / dev / util / CaseIterator.java
1 /**
2 *******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7
8 // copied from the Transliterator demo
9
10 package com.ibm.icu.dev.util;
11
12 import java.util.HashMap;
13 import java.util.HashSet;
14 import java.util.Iterator;
15 import java.util.Map;
16 import java.util.Set;
17 import java.util.TreeSet;
18
19 import com.ibm.icu.lang.UCharacter;
20 import com.ibm.icu.text.Transliterator;
21 import com.ibm.icu.text.UTF16;
22 import com.ibm.icu.text.UnicodeSet;
23
24 /**
25  * Incrementally returns the set of all strings that case-fold to the same value.
26  */
27 public class CaseIterator {
28     
29     // testing stuff
30     private static Transliterator toName = Transliterator.getInstance("[:^ascii:] Any-Name");
31     private static Transliterator toHex = Transliterator.getInstance("[:^ascii:] Any-Hex");
32     private static Transliterator toHex2 = Transliterator.getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");
33     
34     // global tables (could be precompiled)
35     private static Map fromCaseFold = new HashMap();
36     private static Map toCaseFold = new HashMap();
37     private static int maxLength = 0;
38     
39     // This exception list is generated on the console by turning on the GENERATE flag, 
40     // which MUST be false for normal operation.
41     // Once the list is generated, it is pasted in here.
42     // A bit of a kludge, but this bootstrapping is the easiest way 
43     // to get around certain complications in the data.
44     
45     private static final boolean GENERATE = false;
46
47     private static final boolean DUMP = false;
48     
49     private static String[][] exceptionList = {
50         // a\N{MODIFIER LETTER RIGHT HALF RING}
51         {"a\u02BE","A\u02BE","a\u02BE",},
52         // ff
53         {"ff","FF","Ff","fF","ff",},
54         // ffi
55         {"ffi","FFI","FFi","FfI","Ffi","F\uFB01","fFI","fFi","ffI","ffi","f\uFB01","\uFB00I","\uFB00i",},
56         // ffl
57         {"ffl","FFL","FFl","FfL","Ffl","F\uFB02","fFL","fFl","ffL","ffl","f\uFB02","\uFB00L","\uFB00l",},
58         // fi
59         {"fi","FI","Fi","fI","fi",},
60         // fl
61         {"fl","FL","Fl","fL","fl",},
62         // h\N{COMBINING MACRON BELOW}
63         {"h\u0331","H\u0331","h\u0331",},
64         // i\N{COMBINING DOT ABOVE}
65         {"i\u0307","I\u0307","i\u0307",},
66         // j\N{COMBINING CARON}
67         {"j\u030C","J\u030C","j\u030C",},
68         // ss
69         {"ss","SS","Ss","S\u017F","sS","ss","s\u017F","\u017FS","\u017Fs","\u017F\u017F",},
70         // st
71         {"st","ST","St","sT","st","\u017FT","\u017Ft",},
72         // t\N{COMBINING DIAERESIS}
73         {"t\u0308","T\u0308","t\u0308",},
74         // w\N{COMBINING RING ABOVE}
75         {"w\u030A","W\u030A","w\u030A",},
76         // y\N{COMBINING RING ABOVE}
77         {"y\u030A","Y\u030A","y\u030A",},
78         // \N{MODIFIER LETTER APOSTROPHE}n
79         {"\u02BCn","\u02BCN","\u02BCn",},
80         // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
81         {"\u03AC\u03B9","\u0386\u0345","\u0386\u0399","\u0386\u03B9","\u0386\u1FBE","\u03AC\u0345","\u03AC\u0399","\u03AC\u03B9","\u03AC\u1FBE",},
82         // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
83         {"\u03AE\u03B9","\u0389\u0345","\u0389\u0399","\u0389\u03B9","\u0389\u1FBE","\u03AE\u0345","\u03AE\u0399","\u03AE\u03B9","\u03AE\u1FBE",},
84         // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}
85         {"\u03B1\u0342","\u0391\u0342","\u03B1\u0342",},
86         // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
87         {"\u03B1\u0342\u03B9","\u0391\u0342\u0345","\u0391\u0342\u0399","\u0391\u0342\u03B9","\u0391\u0342\u1FBE",
88             "\u03B1\u0342\u0345","\u03B1\u0342\u0399","\u03B1\u0342\u03B9","\u03B1\u0342\u1FBE","\u1FB6\u0345",
89             "\u1FB6\u0399","\u1FB6\u03B9","\u1FB6\u1FBE",},
90         // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}
91         {"\u03B1\u03B9","\u0391\u0345","\u0391\u0399","\u0391\u03B9","\u0391\u1FBE","\u03B1\u0345","\u03B1\u0399","\u03B1\u03B9","\u03B1\u1FBE",},
92         // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}
93         {"\u03B7\u0342","\u0397\u0342","\u03B7\u0342",},
94         // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
95         {"\u03B7\u0342\u03B9","\u0397\u0342\u0345","\u0397\u0342\u0399","\u0397\u0342\u03B9","\u0397\u0342\u1FBE",
96             "\u03B7\u0342\u0345","\u03B7\u0342\u0399","\u03B7\u0342\u03B9","\u03B7\u0342\u1FBE","\u1FC6\u0345","\u1FC6\u0399",
97             "\u1FC6\u03B9","\u1FC6\u1FBE",},
98         // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}
99         {"\u03B7\u03B9","\u0397\u0345","\u0397\u0399","\u0397\u03B9","\u0397\u1FBE","\u03B7\u0345","\u03B7\u0399","\u03B7\u03B9","\u03B7\u1FBE",},
100         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
101         {"\u03B9\u0308\u0300","\u0345\u0308\u0300","\u0399\u0308\u0300","\u03B9\u0308\u0300","\u1FBE\u0308\u0300",},
102         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
103         {"\u03B9\u0308\u0301","\u0345\u0308\u0301","\u0399\u0308\u0301","\u03B9\u0308\u0301","\u1FBE\u0308\u0301",},
104         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
105         {"\u03B9\u0308\u0342","\u0345\u0308\u0342","\u0399\u0308\u0342","\u03B9\u0308\u0342","\u1FBE\u0308\u0342",},
106         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}
107         {"\u03B9\u0342","\u0345\u0342","\u0399\u0342","\u03B9\u0342","\u1FBE\u0342",},
108         // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}
109         {"\u03C1\u0313","\u03A1\u0313","\u03C1\u0313","\u03F1\u0313",},
110         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}
111         {"\u03C5\u0308\u0300","\u03A5\u0308\u0300","\u03C5\u0308\u0300",},
112         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}
113         {"\u03C5\u0308\u0301","\u03A5\u0308\u0301","\u03C5\u0308\u0301",},
114         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}
115         {"\u03C5\u0308\u0342","\u03A5\u0308\u0342","\u03C5\u0308\u0342",},
116         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}
117         {"\u03C5\u0313","\u03A5\u0313","\u03C5\u0313",},
118         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}
119         {"\u03C5\u0313\u0300","\u03A5\u0313\u0300","\u03C5\u0313\u0300","\u1F50\u0300",},
120         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}
121         {"\u03C5\u0313\u0301","\u03A5\u0313\u0301","\u03C5\u0313\u0301","\u1F50\u0301",},
122         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}
123         {"\u03C5\u0313\u0342","\u03A5\u0313\u0342","\u03C5\u0313\u0342","\u1F50\u0342",},
124         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}
125         {"\u03C5\u0342","\u03A5\u0342","\u03C5\u0342",},
126         // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}
127         {"\u03C9\u0342","\u03A9\u0342","\u03C9\u0342","\u2126\u0342",},
128         // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
129         {"\u03C9\u0342\u03B9","\u03A9\u0342\u0345","\u03A9\u0342\u0399","\u03A9\u0342\u03B9","\u03A9\u0342\u1FBE","\u03C9\u0342\u0345","\u03C9\u0342\u0399","\u03C9\u0342\u03B9","\u03C9\u0342\u1FBE","\u1FF6\u0345",
130             "\u1FF6\u0399","\u1FF6\u03B9","\u1FF6\u1FBE","\u2126\u0342\u0345","\u2126\u0342\u0399","\u2126\u0342\u03B9","\u2126\u0342\u1FBE",},
131         // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}
132         {"\u03C9\u03B9","\u03A9\u0345","\u03A9\u0399","\u03A9\u03B9","\u03A9\u1FBE","\u03C9\u0345","\u03C9\u0399","\u03C9\u03B9","\u03C9\u1FBE","\u2126\u0345","\u2126\u0399","\u2126\u03B9","\u2126\u1FBE",},
133         // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}
134         {"\u03CE\u03B9","\u038F\u0345","\u038F\u0399","\u038F\u03B9","\u038F\u1FBE","\u03CE\u0345","\u03CE\u0399","\u03CE\u03B9","\u03CE\u1FBE",},
135         // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}
136         {"\u0565\u0582","\u0535\u0552","\u0535\u0582","\u0565\u0552","\u0565\u0582",},
137         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}
138         {"\u0574\u0565","\u0544\u0535","\u0544\u0565","\u0574\u0535","\u0574\u0565",},
139         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}
140         {"\u0574\u056B","\u0544\u053B","\u0544\u056B","\u0574\u053B","\u0574\u056B",},
141         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}
142         {"\u0574\u056D","\u0544\u053D","\u0544\u056D","\u0574\u053D","\u0574\u056D",},
143         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}
144         {"\u0574\u0576","\u0544\u0546","\u0544\u0576","\u0574\u0546","\u0574\u0576",},
145         // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}
146         {"\u057E\u0576","\u054E\u0546","\u054E\u0576","\u057E\u0546","\u057E\u0576",},
147         // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
148         {"\u1F00\u03B9","\u1F00\u0345","\u1F00\u0399","\u1F00\u03B9","\u1F00\u1FBE","\u1F08\u0345","\u1F08\u0399","\u1F08\u03B9","\u1F08\u1FBE",},
149         // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
150         {"\u1F01\u03B9","\u1F01\u0345","\u1F01\u0399","\u1F01\u03B9","\u1F01\u1FBE","\u1F09\u0345","\u1F09\u0399","\u1F09\u03B9","\u1F09\u1FBE",},
151         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
152         {"\u1F02\u03B9","\u1F02\u0345","\u1F02\u0399","\u1F02\u03B9","\u1F02\u1FBE","\u1F0A\u0345","\u1F0A\u0399","\u1F0A\u03B9","\u1F0A\u1FBE",},
153         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
154         {"\u1F03\u03B9","\u1F03\u0345","\u1F03\u0399","\u1F03\u03B9","\u1F03\u1FBE","\u1F0B\u0345","\u1F0B\u0399","\u1F0B\u03B9","\u1F0B\u1FBE",},
155         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
156         {"\u1F04\u03B9","\u1F04\u0345","\u1F04\u0399","\u1F04\u03B9","\u1F04\u1FBE","\u1F0C\u0345","\u1F0C\u0399","\u1F0C\u03B9","\u1F0C\u1FBE",},
157         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
158         {"\u1F05\u03B9","\u1F05\u0345","\u1F05\u0399","\u1F05\u03B9","\u1F05\u1FBE","\u1F0D\u0345","\u1F0D\u0399","\u1F0D\u03B9","\u1F0D\u1FBE",},
159         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
160         {"\u1F06\u03B9","\u1F06\u0345","\u1F06\u0399","\u1F06\u03B9","\u1F06\u1FBE","\u1F0E\u0345","\u1F0E\u0399","\u1F0E\u03B9","\u1F0E\u1FBE",},
161         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
162         {"\u1F07\u03B9","\u1F07\u0345","\u1F07\u0399","\u1F07\u03B9","\u1F07\u1FBE","\u1F0F\u0345","\u1F0F\u0399","\u1F0F\u03B9","\u1F0F\u1FBE",},
163         // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
164         {"\u1F20\u03B9","\u1F20\u0345","\u1F20\u0399","\u1F20\u03B9","\u1F20\u1FBE","\u1F28\u0345","\u1F28\u0399","\u1F28\u03B9","\u1F28\u1FBE",},
165         // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
166         {"\u1F21\u03B9","\u1F21\u0345","\u1F21\u0399","\u1F21\u03B9","\u1F21\u1FBE","\u1F29\u0345","\u1F29\u0399","\u1F29\u03B9","\u1F29\u1FBE",},
167         // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
168         {"\u1F22\u03B9","\u1F22\u0345","\u1F22\u0399","\u1F22\u03B9","\u1F22\u1FBE","\u1F2A\u0345","\u1F2A\u0399","\u1F2A\u03B9","\u1F2A\u1FBE",},
169         // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
170         {"\u1F23\u03B9","\u1F23\u0345","\u1F23\u0399","\u1F23\u03B9","\u1F23\u1FBE","\u1F2B\u0345","\u1F2B\u0399","\u1F2B\u03B9","\u1F2B\u1FBE",},
171         // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
172         {"\u1F24\u03B9","\u1F24\u0345","\u1F24\u0399","\u1F24\u03B9","\u1F24\u1FBE","\u1F2C\u0345","\u1F2C\u0399","\u1F2C\u03B9","\u1F2C\u1FBE",},
173         // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
174         {"\u1F25\u03B9","\u1F25\u0345","\u1F25\u0399","\u1F25\u03B9","\u1F25\u1FBE","\u1F2D\u0345","\u1F2D\u0399","\u1F2D\u03B9","\u1F2D\u1FBE",},
175         // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
176         {"\u1F26\u03B9","\u1F26\u0345","\u1F26\u0399","\u1F26\u03B9","\u1F26\u1FBE","\u1F2E\u0345","\u1F2E\u0399","\u1F2E\u03B9","\u1F2E\u1FBE",},
177         // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
178         {"\u1F27\u03B9","\u1F27\u0345","\u1F27\u0399","\u1F27\u03B9","\u1F27\u1FBE","\u1F2F\u0345","\u1F2F\u0399","\u1F2F\u03B9","\u1F2F\u1FBE",},
179         // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}
180         {"\u1F60\u03B9","\u1F60\u0345","\u1F60\u0399","\u1F60\u03B9","\u1F60\u1FBE","\u1F68\u0345","\u1F68\u0399","\u1F68\u03B9","\u1F68\u1FBE",},
181         // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}
182         {"\u1F61\u03B9","\u1F61\u0345","\u1F61\u0399","\u1F61\u03B9","\u1F61\u1FBE","\u1F69\u0345","\u1F69\u0399","\u1F69\u03B9","\u1F69\u1FBE",},
183         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}
184         {"\u1F62\u03B9","\u1F62\u0345","\u1F62\u0399","\u1F62\u03B9","\u1F62\u1FBE","\u1F6A\u0345","\u1F6A\u0399","\u1F6A\u03B9","\u1F6A\u1FBE",},
185         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}
186         {"\u1F63\u03B9","\u1F63\u0345","\u1F63\u0399","\u1F63\u03B9","\u1F63\u1FBE","\u1F6B\u0345","\u1F6B\u0399","\u1F6B\u03B9","\u1F6B\u1FBE",},
187         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}
188         {"\u1F64\u03B9","\u1F64\u0345","\u1F64\u0399","\u1F64\u03B9","\u1F64\u1FBE","\u1F6C\u0345","\u1F6C\u0399","\u1F6C\u03B9","\u1F6C\u1FBE",},
189         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}
190         {"\u1F65\u03B9","\u1F65\u0345","\u1F65\u0399","\u1F65\u03B9","\u1F65\u1FBE","\u1F6D\u0345","\u1F6D\u0399","\u1F6D\u03B9","\u1F6D\u1FBE",},
191         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
192         {"\u1F66\u03B9","\u1F66\u0345","\u1F66\u0399","\u1F66\u03B9","\u1F66\u1FBE","\u1F6E\u0345","\u1F6E\u0399","\u1F6E\u03B9","\u1F6E\u1FBE",},
193         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}
194         {"\u1F67\u03B9","\u1F67\u0345","\u1F67\u0399","\u1F67\u03B9","\u1F67\u1FBE","\u1F6F\u0345","\u1F6F\u0399","\u1F6F\u03B9","\u1F6F\u1FBE",},
195         // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
196         {"\u1F70\u03B9","\u1F70\u0345","\u1F70\u0399","\u1F70\u03B9","\u1F70\u1FBE","\u1FBA\u0345","\u1FBA\u0399","\u1FBA\u03B9","\u1FBA\u1FBE",},
197         // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
198         {"\u1F74\u03B9","\u1F74\u0345","\u1F74\u0399","\u1F74\u03B9","\u1F74\u1FBE","\u1FCA\u0345","\u1FCA\u0399","\u1FCA\u03B9","\u1FCA\u1FBE",},
199         // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}
200         {"\u1F7C\u03B9","\u1F7C\u0345","\u1F7C\u0399","\u1F7C\u03B9","\u1F7C\u1FBE","\u1FFA\u0345","\u1FFA\u0399","\u1FFA\u03B9","\u1FFA\u1FBE",},
201     };
202     
203     // this initializes the data used to generated the case-equivalents
204
205     static {
206         
207         // Gather up the exceptions in a form we can use
208         
209         if (!GENERATE) {
210             for (int i = 0; i < exceptionList.length; ++i) {
211                 String[] exception = exceptionList[i];
212                 Set s = new HashSet();
213                 // there has to be some method to do the following, but I can't find it in the collections
214                 for (int j = 0; j < exception.length; ++j) {
215                     s.add(exception[j]);
216                 }
217                 fromCaseFold.put(exception[0], s);
218             }
219         }
220         
221         // walk through all the characters, and at every case fold result,
222         // put a set of all the characters that map to that result
223
224         boolean defaultmapping = true; // false for turkish
225         for (int i = 0; i <= 0x10FFFF; ++i) {
226             int cat = UCharacter.getType(i);
227             if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) continue;
228             
229             String cp = UTF16.valueOf(i);
230             String mapped = UCharacter.foldCase(cp, defaultmapping);
231             if (mapped.equals(cp)) continue;
232             
233             if (maxLength < mapped.length()) maxLength = mapped.length();
234             
235             // at this point, have different case folding
236             
237             Set s = (Set) fromCaseFold.get(mapped);
238             if (s == null) {
239                 s = new HashSet();
240                 s.add(mapped); // add the case fold result itself
241                 fromCaseFold.put(mapped, s);
242             }
243             s.add(cp);
244             toCaseFold.put(cp, mapped);
245             toCaseFold.put(mapped, mapped); // add mapping to self
246         }
247         
248         // Emit the final data
249
250         if (DUMP) {
251             System.out.println("maxLength = " + maxLength);
252
253             System.out.println("\nfromCaseFold:");
254             Iterator it = fromCaseFold.keySet().iterator();
255             while (it.hasNext()) {
256                 Object key = it.next();
257                 System.out.print(" " + toHex2.transliterate((String)key) + ": ");
258                 Set s = (Set) fromCaseFold.get(key);
259                 Iterator it2 = s.iterator();
260                 boolean first = true;
261                 while (it2.hasNext()) {
262                     if (first) {
263                         first = false;
264                     } else {
265                         System.out.print(", ");
266                     }
267                     System.out.print(toHex2.transliterate((String)it2.next()));
268                 }
269                 System.out.println("");
270             }
271
272             System.out.println("\ntoCaseFold:");
273             it = toCaseFold.keySet().iterator();
274             while (it.hasNext()) {
275                 String key = (String) it.next();
276                 String value = (String) toCaseFold.get(key);
277                 System.out.println(" " + toHex2.transliterate(key) + ": " + toHex2.transliterate(value));
278             }            
279         }
280         
281         // Now convert all those sets into linear arrays
282         // We can't do this in place in Java, so make a temporary target array
283         
284         // Note: This could be transformed into a single array, with offsets into it.
285         // Might be best choice in C.
286         
287         
288         Map fromCaseFold2 = new HashMap();
289         Iterator it = fromCaseFold.keySet().iterator();
290         while (it.hasNext()) {
291             Object key = it.next();
292             Set s = (Set) fromCaseFold.get(key);
293             String[] temp = new String[s.size()];
294             s.toArray(temp);
295             fromCaseFold2.put(key, temp);
296         }
297         fromCaseFold = fromCaseFold2;
298
299         // We have processed everything, so the iterator will now work
300         // The following is normally OFF. 
301         // It is here to generate (under the GENERATE flag) the static exception list.
302         // It must be at the very end of initialization, so that the iterator is functional.
303         // (easiest to do it that way)
304             
305         if (GENERATE) {
306
307             // first get small set of items that have multiple characters
308             
309             Set multichars = new TreeSet();
310             it = fromCaseFold.keySet().iterator();
311             while (it.hasNext()) {
312                 String key = (String) it.next();
313                 if (UTF16.countCodePoint(key) < 2) continue;
314                 multichars.add(key);
315             }            
316             
317             // now we will go through each of them.
318             
319             CaseIterator ci = new CaseIterator();
320             it = multichars.iterator();
321             
322             while (it.hasNext()) {
323                 String key = (String) it.next();
324                 
325                 // here is a nasty complication. Take 'ffi' ligature. We
326                 // can't just close it, since we would miss the combination
327                 // that includes the 'fi' => "fi" ligature
328                 // so first do a pass through, and add substring combinations
329                 // we call this a 'partial closure'
330                 
331                 Set partialClosure = new TreeSet();
332                 partialClosure.add(key);
333                 
334                 if (UTF16.countCodePoint(key) > 2) {
335                     Iterator multiIt2 = multichars.iterator();
336                     while (multiIt2.hasNext()) {
337                         String otherKey = (String) multiIt2.next();
338                         if (otherKey.length() >= key.length()) continue;
339                         int pos = -1;
340                         while (true) {
341                             // The following is not completely general
342                             // but works for the actual cased stuff,
343                             // and should work for future characters, since we won't have
344                             // more ligatures & other oddities.
345                             pos = key.indexOf(otherKey, pos+1);
346                             if (pos < 0) break;
347                             int endPos = pos + otherKey.length();
348                             // we know we have a proper substring,
349                             // so get the combinations
350                             String[] choices = (String[]) fromCaseFold.get(otherKey);
351                             for (int ii = 0; ii < choices.length; ++ii) {
352                                 String patchwork = key.substring(0, pos)
353                                     + choices[ii]
354                                     + key.substring(endPos);
355                                 partialClosure.add(patchwork);
356                             }
357                         }
358                     }
359                 }
360                 
361                 // now, for each thing in the partial closure, get its
362                 // case closure and add it to the final result.
363                 
364                 Set closure = new TreeSet(); // this will be the real closure
365                 Iterator partialIt = partialClosure.iterator();
366                 while (partialIt.hasNext()) {
367                     String key2 = (String) partialIt.next();
368                     ci.reset(key2);
369                     for (String temp = ci.next(); temp != null; temp = ci.next()) {
370                         closure.add(temp);
371                     }
372                     // form closure
373                     /*String[] choices = (String[]) fromCaseFold.get(key2);
374                     for (int i = 0; i < choices.length; ++i) {
375                         ci.reset(choices[i]);
376                         String temp;
377                         while (null != (temp = ci.next())) {
378                             closure.add(temp);
379                         }
380                     }
381                     */
382                 }
383                 
384                 // print it out, so that it can be cut and pasted back into this document.
385                 
386                 Iterator it2 = closure.iterator();
387                 System.out.println("\t// " + toName.transliterate(key));
388                 System.out.print("\t{\"" + toHex.transliterate(key) + "\",");
389                 while (it2.hasNext()) {
390                     String item = (String)it2.next();
391                     System.out.print("\"" + toHex.transliterate(item) + "\",");
392                 }
393                 System.out.println("},");
394             }
395         }
396     }
397     
398     // ============ PRIVATE CLASS DATA ============ 
399     
400     // pieces that we will put together
401     // is not changed during iteration
402     private int count = 0;
403     private String[][] variants;
404     
405     // state information, changes during iteration
406     private boolean done = false;
407     private int[] counts;
408     
409     // internal buffer for efficiency
410     private StringBuffer nextBuffer = new StringBuffer();
411     
412     // ========================  
413
414     /**
415      * Reset to different source. Once reset, the iteration starts from the beginning.
416      * @param source The string to get case variants for
417      */
418     public void reset(String source) {
419         
420         // allocate arrays to store pieces
421         // using length might be slightly too long, but we don't care much
422         
423         counts = new int[source.length()];
424         variants = new String[source.length()][];
425         
426         // walk through the source, and break up into pieces
427         // each piece becomes an array of equivalent values
428         // TODO: could optimized this later to coalesce all single string pieces
429         
430         String piece = null;
431         count = 0;
432         for (int i = 0; i < source.length(); i += piece.length()) {
433             
434             // find *longest* matching piece
435             String caseFold = null;
436             
437             if (GENERATE) {
438                 // do exactly one CP
439                 piece = UTF16.valueOf(source, i);
440                 caseFold = (String) toCaseFold.get(piece);
441             } else {               
442                 int max = i + maxLength;
443                 if (max > source.length()) max = source.length();
444                 for (int j = max; j > i; --j) {
445                     piece = source.substring(i, j);
446                     caseFold = (String) toCaseFold.get(piece);
447                     if (caseFold != null) break;
448                 }
449             }
450             
451             // if we fail, pick one code point
452             if (caseFold == null) {
453                 piece = UTF16.valueOf(source, i);
454                 variants[count++] = new String[] {piece}; // single item string
455             } else {
456                 variants[count++] = (String[])fromCaseFold.get(caseFold);
457             }
458         }
459         reset();
460     }
461     
462     /**
463      * Restart the iteration from the beginning, but with same source
464      */
465     public void reset() {
466         done = false;
467         for (int i = 0; i < count; ++i) {
468             counts[i] = 0;
469         }
470     }
471     
472     /**
473      * Iterates through the case variants.
474      * @return next case variant. Each variant will case-fold to the same value as the source will.
475      * When the iteration is done, null is returned.
476      */
477     public String next() {
478         
479         if (done) return null;
480         int i;
481         
482         // TODO Optimize so we keep the piece before and after the current position
483         // so we don't have so much concatenation
484         
485         // get the result, a concatenation
486         
487         nextBuffer.setLength(0);
488         for (i = 0; i < count; ++i) {
489             nextBuffer.append(variants[i][counts[i]]);
490         }
491         
492         // find the next right set of pieces to concatenate
493         
494         for (i = count-1; i >= 0; --i) {
495             counts[i]++;
496             if (counts[i] < variants[i].length) break;
497             counts[i] = 0;
498         }
499         
500         // if we go too far, bail
501         
502         if (i < 0) {
503             done = true;
504         }
505         
506         return nextBuffer.toString();            
507     }
508         
509         
510     /**
511      * Temporary test, just to see how the stuff works.
512      */
513     static public void main(String[] args) {
514         String[] testCases = {"fiss", "h\u03a3"};
515         CaseIterator ci = new CaseIterator();
516         
517         for (int i = 0; i < testCases.length; ++i) {
518             String item = testCases[i];
519             System.out.println();
520             System.out.println("Testing: " + toName.transliterate(item));
521             System.out.println();
522             ci.reset(item);
523             int count = 0;
524             for (String temp = ci.next(); temp != null; temp = ci.next()) {
525                 System.out.println(toName.transliterate(temp));
526                 count++;
527             }
528             System.out.println("Total: " + count);
529         }
530
531         // generate a list of all caseless characters -- characters whose
532         // case closure is themselves.
533
534         UnicodeSet caseless = new UnicodeSet();
535
536         for (int i = 0; i <= 0x10FFFF; ++i) {
537             String cp = UTF16.valueOf(i);
538             ci.reset(cp);
539             int count = 0;
540             String fold = null;
541             for (String temp = ci.next(); temp != null; temp = ci.next()) {
542                 fold = temp;
543                 if (++count > 1) break;
544             }
545             if (count==1 && fold.equals(cp)) {
546                 caseless.add(i);
547             }
548         }
549
550         System.out.println("caseless = " + caseless.toPattern(true));
551
552         UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
553         
554         UnicodeSet a = new UnicodeSet();
555         a.set(not_lc);
556         a.removeAll(caseless);
557         System.out.println("[:^lc:] - caseless = " + a.toPattern(true));
558
559         a.set(caseless);
560         a.removeAll(not_lc);
561         System.out.println("caseless - [:^lc:] = " + a.toPattern(true));
562     }
563 }