]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/demos/src/com/ibm/icu/dev/demo/translit/CaseIterator.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / demos / src / com / ibm / icu / dev / demo / translit / CaseIterator.java
1 /**\r
2 *******************************************************************************\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and    *\r
4 * others. All Rights Reserved.                                                *\r
5 *******************************************************************************\r
6 */\r
7 \r
8 package com.ibm.icu.dev.demo.translit;\r
9 import java.util.HashMap;\r
10 import java.util.HashSet;\r
11 import java.util.Iterator;\r
12 import java.util.Map;\r
13 import java.util.Set;\r
14 import java.util.TreeSet;\r
15 \r
16 import com.ibm.icu.lang.UCharacter;\r
17 import com.ibm.icu.text.Transliterator;\r
18 import com.ibm.icu.text.UTF16;\r
19 import com.ibm.icu.text.UnicodeSet;\r
20 \r
21 /**\r
22  * Incrementally returns the set of all strings that case-fold to the same value.\r
23  */\r
24 public class CaseIterator {\r
25     \r
26     // testing stuff\r
27     static Transliterator toName = Transliterator.getInstance("[:^ascii:] Any-Name");\r
28     static Transliterator toHex = Transliterator.getInstance("[:^ascii:] Any-Hex");\r
29     static Transliterator toHex2 = Transliterator.getInstance("[[^\u0021-\u007F]-[,]] Any-Hex");\r
30     \r
31     // global tables (could be precompiled)\r
32     private static Map fromCaseFold = new HashMap();\r
33     private static Map toCaseFold = new HashMap();\r
34     private static int maxLength = 0;\r
35     \r
36     // This exception list is generated on the console by turning on the GENERATE flag, \r
37     // which MUST be false for normal operation.\r
38     // Once the list is generated, it is pasted in here.\r
39     // A bit of a kludge, but this bootstrapping is the easiest way \r
40     // to get around certain complications in the data.\r
41     \r
42     private static final boolean GENERATE = false;\r
43 \r
44     private static final boolean DUMP = false;\r
45     \r
46     private static String[][] exceptionList = {\r
47         // a\N{MODIFIER LETTER RIGHT HALF RING}\r
48         {"a\u02BE","A\u02BE","a\u02BE",},\r
49         // ff\r
50         {"ff","FF","Ff","fF","ff",},\r
51         // ffi\r
52         {"ffi","FFI","FFi","FfI","Ffi","F\uFB01","fFI","fFi","ffI","ffi","f\uFB01","\uFB00I","\uFB00i",},\r
53         // ffl\r
54         {"ffl","FFL","FFl","FfL","Ffl","F\uFB02","fFL","fFl","ffL","ffl","f\uFB02","\uFB00L","\uFB00l",},\r
55         // fi\r
56         {"fi","FI","Fi","fI","fi",},\r
57         // fl\r
58         {"fl","FL","Fl","fL","fl",},\r
59         // h\N{COMBINING MACRON BELOW}\r
60         {"h\u0331","H\u0331","h\u0331",},\r
61         // i\N{COMBINING DOT ABOVE}\r
62         {"i\u0307","I\u0307","i\u0307",},\r
63         // j\N{COMBINING CARON}\r
64         {"j\u030C","J\u030C","j\u030C",},\r
65         // ss\r
66         {"ss","SS","Ss","S\u017F","sS","ss","s\u017F","\u017FS","\u017Fs","\u017F\u017F",},\r
67         // st\r
68         {"st","ST","St","sT","st","\u017FT","\u017Ft",},\r
69         // t\N{COMBINING DIAERESIS}\r
70         {"t\u0308","T\u0308","t\u0308",},\r
71         // w\N{COMBINING RING ABOVE}\r
72         {"w\u030A","W\u030A","w\u030A",},\r
73         // y\N{COMBINING RING ABOVE}\r
74         {"y\u030A","Y\u030A","y\u030A",},\r
75         // \N{MODIFIER LETTER APOSTROPHE}n\r
76         {"\u02BCn","\u02BCN","\u02BCn",},\r
77         // \N{GREEK SMALL LETTER ALPHA WITH TONOS}\N{GREEK SMALL LETTER IOTA}\r
78         {"\u03AC\u03B9","\u0386\u0345","\u0386\u0399","\u0386\u03B9","\u0386\u1FBE","\u03AC\u0345","\u03AC\u0399","\u03AC\u03B9","\u03AC\u1FBE",},\r
79         // \N{GREEK SMALL LETTER ETA WITH TONOS}\N{GREEK SMALL LETTER IOTA}\r
80         {"\u03AE\u03B9","\u0389\u0345","\u0389\u0399","\u0389\u03B9","\u0389\u1FBE","\u03AE\u0345","\u03AE\u0399","\u03AE\u03B9","\u03AE\u1FBE",},\r
81         // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\r
82         {"\u03B1\u0342","\u0391\u0342","\u03B1\u0342",},\r
83         // \N{GREEK SMALL LETTER ALPHA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
84         {"\u03B1\u0342\u03B9","\u0391\u0342\u0345","\u0391\u0342\u0399","\u0391\u0342\u03B9","\u0391\u0342\u1FBE",\r
85             "\u03B1\u0342\u0345","\u03B1\u0342\u0399","\u03B1\u0342\u03B9","\u03B1\u0342\u1FBE","\u1FB6\u0345",\r
86             "\u1FB6\u0399","\u1FB6\u03B9","\u1FB6\u1FBE",},\r
87         // \N{GREEK SMALL LETTER ALPHA}\N{GREEK SMALL LETTER IOTA}\r
88         {"\u03B1\u03B9","\u0391\u0345","\u0391\u0399","\u0391\u03B9","\u0391\u1FBE","\u03B1\u0345","\u03B1\u0399","\u03B1\u03B9","\u03B1\u1FBE",},\r
89         // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\r
90         {"\u03B7\u0342","\u0397\u0342","\u03B7\u0342",},\r
91         // \N{GREEK SMALL LETTER ETA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
92         {"\u03B7\u0342\u03B9","\u0397\u0342\u0345","\u0397\u0342\u0399","\u0397\u0342\u03B9","\u0397\u0342\u1FBE",\r
93             "\u03B7\u0342\u0345","\u03B7\u0342\u0399","\u03B7\u0342\u03B9","\u03B7\u0342\u1FBE","\u1FC6\u0345","\u1FC6\u0399",\r
94             "\u1FC6\u03B9","\u1FC6\u1FBE",},\r
95         // \N{GREEK SMALL LETTER ETA}\N{GREEK SMALL LETTER IOTA}\r
96         {"\u03B7\u03B9","\u0397\u0345","\u0397\u0399","\u0397\u03B9","\u0397\u1FBE","\u03B7\u0345","\u03B7\u0399","\u03B7\u03B9","\u03B7\u1FBE",},\r
97         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}\r
98         {"\u03B9\u0308\u0300","\u0345\u0308\u0300","\u0399\u0308\u0300","\u03B9\u0308\u0300","\u1FBE\u0308\u0300",},\r
99         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}\r
100         {"\u03B9\u0308\u0301","\u0345\u0308\u0301","\u0399\u0308\u0301","\u03B9\u0308\u0301","\u1FBE\u0308\u0301",},\r
101         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}\r
102         {"\u03B9\u0308\u0342","\u0345\u0308\u0342","\u0399\u0308\u0342","\u03B9\u0308\u0342","\u1FBE\u0308\u0342",},\r
103         // \N{GREEK SMALL LETTER IOTA}\N{COMBINING GREEK PERISPOMENI}\r
104         {"\u03B9\u0342","\u0345\u0342","\u0399\u0342","\u03B9\u0342","\u1FBE\u0342",},\r
105         // \N{GREEK SMALL LETTER RHO}\N{COMBINING COMMA ABOVE}\r
106         {"\u03C1\u0313","\u03A1\u0313","\u03C1\u0313","\u03F1\u0313",},\r
107         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}\r
108         {"\u03C5\u0308\u0300","\u03A5\u0308\u0300","\u03C5\u0308\u0300",},\r
109         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING ACUTE ACCENT}\r
110         {"\u03C5\u0308\u0301","\u03A5\u0308\u0301","\u03C5\u0308\u0301",},\r
111         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING DIAERESIS}\N{COMBINING GREEK PERISPOMENI}\r
112         {"\u03C5\u0308\u0342","\u03A5\u0308\u0342","\u03C5\u0308\u0342",},\r
113         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\r
114         {"\u03C5\u0313","\u03A5\u0313","\u03C5\u0313",},\r
115         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GRAVE ACCENT}\r
116         {"\u03C5\u0313\u0300","\u03A5\u0313\u0300","\u03C5\u0313\u0300","\u1F50\u0300",},\r
117         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING ACUTE ACCENT}\r
118         {"\u03C5\u0313\u0301","\u03A5\u0313\u0301","\u03C5\u0313\u0301","\u1F50\u0301",},\r
119         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING COMMA ABOVE}\N{COMBINING GREEK PERISPOMENI}\r
120         {"\u03C5\u0313\u0342","\u03A5\u0313\u0342","\u03C5\u0313\u0342","\u1F50\u0342",},\r
121         // \N{GREEK SMALL LETTER UPSILON}\N{COMBINING GREEK PERISPOMENI}\r
122         {"\u03C5\u0342","\u03A5\u0342","\u03C5\u0342",},\r
123         // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\r
124         {"\u03C9\u0342","\u03A9\u0342","\u03C9\u0342","\u2126\u0342",},\r
125         // \N{GREEK SMALL LETTER OMEGA}\N{COMBINING GREEK PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
126         {"\u03C9\u0342\u03B9","\u03A9\u0342\u0345","\u03A9\u0342\u0399","\u03A9\u0342\u03B9","\u03A9\u0342\u1FBE","\u03C9\u0342\u0345","\u03C9\u0342\u0399","\u03C9\u0342\u03B9","\u03C9\u0342\u1FBE","\u1FF6\u0345",\r
127             "\u1FF6\u0399","\u1FF6\u03B9","\u1FF6\u1FBE","\u2126\u0342\u0345","\u2126\u0342\u0399","\u2126\u0342\u03B9","\u2126\u0342\u1FBE",},\r
128         // \N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER IOTA}\r
129         {"\u03C9\u03B9","\u03A9\u0345","\u03A9\u0399","\u03A9\u03B9","\u03A9\u1FBE","\u03C9\u0345","\u03C9\u0399","\u03C9\u03B9","\u03C9\u1FBE","\u2126\u0345","\u2126\u0399","\u2126\u03B9","\u2126\u1FBE",},\r
130         // \N{GREEK SMALL LETTER OMEGA WITH TONOS}\N{GREEK SMALL LETTER IOTA}\r
131         {"\u03CE\u03B9","\u038F\u0345","\u038F\u0399","\u038F\u03B9","\u038F\u1FBE","\u03CE\u0345","\u03CE\u0399","\u03CE\u03B9","\u03CE\u1FBE",},\r
132         // \N{ARMENIAN SMALL LETTER ECH}\N{ARMENIAN SMALL LETTER YIWN}\r
133         {"\u0565\u0582","\u0535\u0552","\u0535\u0582","\u0565\u0552","\u0565\u0582",},\r
134         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER ECH}\r
135         {"\u0574\u0565","\u0544\u0535","\u0544\u0565","\u0574\u0535","\u0574\u0565",},\r
136         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER INI}\r
137         {"\u0574\u056B","\u0544\u053B","\u0544\u056B","\u0574\u053B","\u0574\u056B",},\r
138         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER XEH}\r
139         {"\u0574\u056D","\u0544\u053D","\u0544\u056D","\u0574\u053D","\u0574\u056D",},\r
140         // \N{ARMENIAN SMALL LETTER MEN}\N{ARMENIAN SMALL LETTER NOW}\r
141         {"\u0574\u0576","\u0544\u0546","\u0544\u0576","\u0574\u0546","\u0574\u0576",},\r
142         // \N{ARMENIAN SMALL LETTER VEW}\N{ARMENIAN SMALL LETTER NOW}\r
143         {"\u057E\u0576","\u054E\u0546","\u054E\u0576","\u057E\u0546","\u057E\u0576",},\r
144         // \N{GREEK SMALL LETTER ALPHA WITH PSILI}\N{GREEK SMALL LETTER IOTA}\r
145         {"\u1F00\u03B9","\u1F00\u0345","\u1F00\u0399","\u1F00\u03B9","\u1F00\u1FBE","\u1F08\u0345","\u1F08\u0399","\u1F08\u03B9","\u1F08\u1FBE",},\r
146         // \N{GREEK SMALL LETTER ALPHA WITH DASIA}\N{GREEK SMALL LETTER IOTA}\r
147         {"\u1F01\u03B9","\u1F01\u0345","\u1F01\u0399","\u1F01\u03B9","\u1F01\u1FBE","\u1F09\u0345","\u1F09\u0399","\u1F09\u03B9","\u1F09\u1FBE",},\r
148         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}\r
149         {"\u1F02\u03B9","\u1F02\u0345","\u1F02\u0399","\u1F02\u03B9","\u1F02\u1FBE","\u1F0A\u0345","\u1F0A\u0399","\u1F0A\u03B9","\u1F0A\u1FBE",},\r
150         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}\r
151         {"\u1F03\u03B9","\u1F03\u0345","\u1F03\u0399","\u1F03\u03B9","\u1F03\u1FBE","\u1F0B\u0345","\u1F0B\u0399","\u1F0B\u03B9","\u1F0B\u1FBE",},\r
152         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}\r
153         {"\u1F04\u03B9","\u1F04\u0345","\u1F04\u0399","\u1F04\u03B9","\u1F04\u1FBE","\u1F0C\u0345","\u1F0C\u0399","\u1F0C\u03B9","\u1F0C\u1FBE",},\r
154         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}\r
155         {"\u1F05\u03B9","\u1F05\u0345","\u1F05\u0399","\u1F05\u03B9","\u1F05\u1FBE","\u1F0D\u0345","\u1F0D\u0399","\u1F0D\u03B9","\u1F0D\u1FBE",},\r
156         // \N{GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
157         {"\u1F06\u03B9","\u1F06\u0345","\u1F06\u0399","\u1F06\u03B9","\u1F06\u1FBE","\u1F0E\u0345","\u1F0E\u0399","\u1F0E\u03B9","\u1F0E\u1FBE",},\r
158         // \N{GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
159         {"\u1F07\u03B9","\u1F07\u0345","\u1F07\u0399","\u1F07\u03B9","\u1F07\u1FBE","\u1F0F\u0345","\u1F0F\u0399","\u1F0F\u03B9","\u1F0F\u1FBE",},\r
160         // \N{GREEK SMALL LETTER ETA WITH PSILI}\N{GREEK SMALL LETTER IOTA}\r
161         {"\u1F20\u03B9","\u1F20\u0345","\u1F20\u0399","\u1F20\u03B9","\u1F20\u1FBE","\u1F28\u0345","\u1F28\u0399","\u1F28\u03B9","\u1F28\u1FBE",},\r
162         // \N{GREEK SMALL LETTER ETA WITH DASIA}\N{GREEK SMALL LETTER IOTA}\r
163         {"\u1F21\u03B9","\u1F21\u0345","\u1F21\u0399","\u1F21\u03B9","\u1F21\u1FBE","\u1F29\u0345","\u1F29\u0399","\u1F29\u03B9","\u1F29\u1FBE",},\r
164         // \N{GREEK SMALL LETTER ETA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}\r
165         {"\u1F22\u03B9","\u1F22\u0345","\u1F22\u0399","\u1F22\u03B9","\u1F22\u1FBE","\u1F2A\u0345","\u1F2A\u0399","\u1F2A\u03B9","\u1F2A\u1FBE",},\r
166         // \N{GREEK SMALL LETTER ETA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}\r
167         {"\u1F23\u03B9","\u1F23\u0345","\u1F23\u0399","\u1F23\u03B9","\u1F23\u1FBE","\u1F2B\u0345","\u1F2B\u0399","\u1F2B\u03B9","\u1F2B\u1FBE",},\r
168         // \N{GREEK SMALL LETTER ETA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}\r
169         {"\u1F24\u03B9","\u1F24\u0345","\u1F24\u0399","\u1F24\u03B9","\u1F24\u1FBE","\u1F2C\u0345","\u1F2C\u0399","\u1F2C\u03B9","\u1F2C\u1FBE",},\r
170         // \N{GREEK SMALL LETTER ETA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}\r
171         {"\u1F25\u03B9","\u1F25\u0345","\u1F25\u0399","\u1F25\u03B9","\u1F25\u1FBE","\u1F2D\u0345","\u1F2D\u0399","\u1F2D\u03B9","\u1F2D\u1FBE",},\r
172         // \N{GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
173         {"\u1F26\u03B9","\u1F26\u0345","\u1F26\u0399","\u1F26\u03B9","\u1F26\u1FBE","\u1F2E\u0345","\u1F2E\u0399","\u1F2E\u03B9","\u1F2E\u1FBE",},\r
174         // \N{GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
175         {"\u1F27\u03B9","\u1F27\u0345","\u1F27\u0399","\u1F27\u03B9","\u1F27\u1FBE","\u1F2F\u0345","\u1F2F\u0399","\u1F2F\u03B9","\u1F2F\u1FBE",},\r
176         // \N{GREEK SMALL LETTER OMEGA WITH PSILI}\N{GREEK SMALL LETTER IOTA}\r
177         {"\u1F60\u03B9","\u1F60\u0345","\u1F60\u0399","\u1F60\u03B9","\u1F60\u1FBE","\u1F68\u0345","\u1F68\u0399","\u1F68\u03B9","\u1F68\u1FBE",},\r
178         // \N{GREEK SMALL LETTER OMEGA WITH DASIA}\N{GREEK SMALL LETTER IOTA}\r
179         {"\u1F61\u03B9","\u1F61\u0345","\u1F61\u0399","\u1F61\u03B9","\u1F61\u1FBE","\u1F69\u0345","\u1F69\u0399","\u1F69\u03B9","\u1F69\u1FBE",},\r
180         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA}\N{GREEK SMALL LETTER IOTA}\r
181         {"\u1F62\u03B9","\u1F62\u0345","\u1F62\u0399","\u1F62\u03B9","\u1F62\u1FBE","\u1F6A\u0345","\u1F6A\u0399","\u1F6A\u03B9","\u1F6A\u1FBE",},\r
182         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA}\N{GREEK SMALL LETTER IOTA}\r
183         {"\u1F63\u03B9","\u1F63\u0345","\u1F63\u0399","\u1F63\u03B9","\u1F63\u1FBE","\u1F6B\u0345","\u1F6B\u0399","\u1F6B\u03B9","\u1F6B\u1FBE",},\r
184         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA}\N{GREEK SMALL LETTER IOTA}\r
185         {"\u1F64\u03B9","\u1F64\u0345","\u1F64\u0399","\u1F64\u03B9","\u1F64\u1FBE","\u1F6C\u0345","\u1F6C\u0399","\u1F6C\u03B9","\u1F6C\u1FBE",},\r
186         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA}\N{GREEK SMALL LETTER IOTA}\r
187         {"\u1F65\u03B9","\u1F65\u0345","\u1F65\u0399","\u1F65\u03B9","\u1F65\u1FBE","\u1F6D\u0345","\u1F6D\u0399","\u1F6D\u03B9","\u1F6D\u1FBE",},\r
188         // \N{GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
189         {"\u1F66\u03B9","\u1F66\u0345","\u1F66\u0399","\u1F66\u03B9","\u1F66\u1FBE","\u1F6E\u0345","\u1F6E\u0399","\u1F6E\u03B9","\u1F6E\u1FBE",},\r
190         // \N{GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI}\N{GREEK SMALL LETTER IOTA}\r
191         {"\u1F67\u03B9","\u1F67\u0345","\u1F67\u0399","\u1F67\u03B9","\u1F67\u1FBE","\u1F6F\u0345","\u1F6F\u0399","\u1F6F\u03B9","\u1F6F\u1FBE",},\r
192         // \N{GREEK SMALL LETTER ALPHA WITH VARIA}\N{GREEK SMALL LETTER IOTA}\r
193         {"\u1F70\u03B9","\u1F70\u0345","\u1F70\u0399","\u1F70\u03B9","\u1F70\u1FBE","\u1FBA\u0345","\u1FBA\u0399","\u1FBA\u03B9","\u1FBA\u1FBE",},\r
194         // \N{GREEK SMALL LETTER ETA WITH VARIA}\N{GREEK SMALL LETTER IOTA}\r
195         {"\u1F74\u03B9","\u1F74\u0345","\u1F74\u0399","\u1F74\u03B9","\u1F74\u1FBE","\u1FCA\u0345","\u1FCA\u0399","\u1FCA\u03B9","\u1FCA\u1FBE",},\r
196         // \N{GREEK SMALL LETTER OMEGA WITH VARIA}\N{GREEK SMALL LETTER IOTA}\r
197         {"\u1F7C\u03B9","\u1F7C\u0345","\u1F7C\u0399","\u1F7C\u03B9","\u1F7C\u1FBE","\u1FFA\u0345","\u1FFA\u0399","\u1FFA\u03B9","\u1FFA\u1FBE",},\r
198     };\r
199     \r
200     // this initializes the data used to generated the case-equivalents\r
201 \r
202     static {\r
203         \r
204         // Gather up the exceptions in a form we can use\r
205         \r
206         if (!GENERATE) {\r
207             for (int i = 0; i < exceptionList.length; ++i) {\r
208                 String[] exception = exceptionList[i];\r
209                 Set s = new HashSet();\r
210                 // there has to be some method to do the following, but I can't find it in the collections\r
211                 for (int j = 0; j < exception.length; ++j) {\r
212                     s.add(exception[j]);\r
213                 }\r
214                 fromCaseFold.put(exception[0], s);\r
215             }\r
216         }\r
217         \r
218         // walk through all the characters, and at every case fold result,\r
219         // put a set of all the characters that map to that result\r
220 \r
221         boolean defaultmapping = true; // false for turkish\r
222         for (int i = 0; i <= 0x10FFFF; ++i) {\r
223             int cat = UCharacter.getType(i);\r
224             if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE) continue;\r
225             \r
226             String cp = UTF16.valueOf(i);\r
227             String mapped = UCharacter.foldCase(cp, defaultmapping);\r
228             if (mapped.equals(cp)) continue;\r
229             \r
230             if (maxLength < mapped.length()) maxLength = mapped.length();\r
231             \r
232             // at this point, have different case folding\r
233             \r
234             Set s = (Set) fromCaseFold.get(mapped);\r
235             if (s == null) {\r
236                 s = new HashSet();\r
237                 s.add(mapped); // add the case fold result itself\r
238                 fromCaseFold.put(mapped, s);\r
239             }\r
240             s.add(cp);\r
241             toCaseFold.put(cp, mapped);\r
242             toCaseFold.put(mapped, mapped); // add mapping to self\r
243         }\r
244         \r
245         // Emit the final data\r
246 \r
247         if (DUMP) {\r
248             System.out.println("maxLength = " + maxLength);\r
249 \r
250             System.out.println("\nfromCaseFold:");\r
251             Iterator it = fromCaseFold.keySet().iterator();\r
252             while (it.hasNext()) {\r
253                 Object key = it.next();\r
254                 System.out.print(" " + toHex2.transliterate((String)key) + ": ");\r
255                 Set s = (Set) fromCaseFold.get(key);\r
256                 Iterator it2 = s.iterator();\r
257                 boolean first = true;\r
258                 while (it2.hasNext()) {\r
259                     if (first) {\r
260                         first = false;\r
261                     } else {\r
262                         System.out.print(", ");\r
263                     }\r
264                     System.out.print(toHex2.transliterate((String)it2.next()));\r
265                 }\r
266                 System.out.println("");\r
267             }\r
268 \r
269             System.out.println("\ntoCaseFold:");\r
270             it = toCaseFold.keySet().iterator();\r
271             while (it.hasNext()) {\r
272                 String key = (String) it.next();\r
273                 String value = (String) toCaseFold.get(key);\r
274                 System.out.println(" " + toHex2.transliterate(key) + ": " + toHex2.transliterate(value));\r
275             }            \r
276         }\r
277         \r
278         // Now convert all those sets into linear arrays\r
279         // We can't do this in place in Java, so make a temporary target array\r
280         \r
281         // Note: This could be transformed into a single array, with offsets into it.\r
282         // Might be best choice in C.\r
283         \r
284         \r
285         Map fromCaseFold2 = new HashMap();\r
286         Iterator it = fromCaseFold.keySet().iterator();\r
287         while (it.hasNext()) {\r
288             Object key = it.next();\r
289             Set s = (Set) fromCaseFold.get(key);\r
290             String[] temp = new String[s.size()];\r
291             s.toArray(temp);\r
292             fromCaseFold2.put(key, temp);\r
293         }\r
294         fromCaseFold = fromCaseFold2;\r
295 \r
296         // We have processed everything, so the iterator will now work\r
297         // The following is normally OFF. \r
298         // It is here to generate (under the GENERATE flag) the static exception list.\r
299         // It must be at the very end of initialization, so that the iterator is functional.\r
300         // (easiest to do it that way)\r
301             \r
302         if (GENERATE) {\r
303 \r
304             // first get small set of items that have multiple characters\r
305             \r
306             Set multichars = new TreeSet();\r
307             it = fromCaseFold.keySet().iterator();\r
308             while (it.hasNext()) {\r
309                 String key = (String) it.next();\r
310                 if (UTF16.countCodePoint(key) < 2) continue;\r
311                 multichars.add(key);\r
312             }            \r
313             \r
314             // now we will go through each of them.\r
315             \r
316             CaseIterator ci = new CaseIterator();\r
317             it = multichars.iterator();\r
318             \r
319             while (it.hasNext()) {\r
320                 String key = (String) it.next();\r
321                 \r
322                 // here is a nasty complication. Take 'ffi' ligature. We\r
323                 // can't just close it, since we would miss the combination\r
324                 // that includes the 'fi' => "fi" ligature\r
325                 // so first do a pass through, and add substring combinations\r
326                 // we call this a 'partial closure'\r
327                 \r
328                 Set partialClosure = new TreeSet();\r
329                 partialClosure.add(key);\r
330                 \r
331                 if (UTF16.countCodePoint(key) > 2) {\r
332                     Iterator multiIt2 = multichars.iterator();\r
333                     while (multiIt2.hasNext()) {\r
334                         String otherKey = (String) multiIt2.next();\r
335                         if (otherKey.length() >= key.length()) continue;\r
336                         int pos = -1;\r
337                         while (true) {\r
338                             // The following is not completely general\r
339                             // but works for the actual cased stuff,\r
340                             // and should work for future characters, since we won't have\r
341                             // more ligatures & other oddities.\r
342                             pos = key.indexOf(otherKey, pos+1);\r
343                             if (pos < 0) break;\r
344                             int endPos = pos + otherKey.length();\r
345                             // we know we have a proper substring,\r
346                             // so get the combinations\r
347                             String[] choices = (String[]) fromCaseFold.get(otherKey);\r
348                             for (int ii = 0; ii < choices.length; ++ii) {\r
349                                 String patchwork = key.substring(0, pos)\r
350                                     + choices[ii]\r
351                                     + key.substring(endPos);\r
352                                 partialClosure.add(patchwork);\r
353                             }\r
354                         }\r
355                     }\r
356                 }\r
357                 \r
358                 // now, for each thing in the partial closure, get its\r
359                 // case closure and add it to the final result.\r
360                 \r
361                 Set closure = new TreeSet(); // this will be the real closure\r
362                 Iterator partialIt = partialClosure.iterator();\r
363                 while (partialIt.hasNext()) {\r
364                     String key2 = (String) partialIt.next();\r
365                     ci.reset(key2);\r
366                     for (String temp = ci.next(); temp != null; temp = ci.next()) {\r
367                         closure.add(temp);\r
368                     }\r
369                     // form closure\r
370                     /*String[] choices = (String[]) fromCaseFold.get(key2);\r
371                     for (int i = 0; i < choices.length; ++i) {\r
372                         ci.reset(choices[i]);\r
373                         String temp;\r
374                         while (null != (temp = ci.next())) {\r
375                             closure.add(temp);\r
376                         }\r
377                     }\r
378                     */\r
379                 }\r
380                 \r
381                 // print it out, so that it can be cut and pasted back into this document.\r
382                 \r
383                 Iterator it2 = closure.iterator();\r
384                 System.out.println("\t// " + toName.transliterate(key));\r
385                 System.out.print("\t{\"" + toHex.transliterate(key) + "\",");\r
386                 while (it2.hasNext()) {\r
387                     String item = (String)it2.next();\r
388                     System.out.print("\"" + toHex.transliterate(item) + "\",");\r
389                 }\r
390                 System.out.println("},");\r
391             }\r
392         }\r
393     }\r
394     \r
395     // ============ PRIVATE CLASS DATA ============ \r
396     \r
397     // pieces that we will put together\r
398     // is not changed during iteration\r
399     private int count = 0;\r
400     private String[][] variants;\r
401     \r
402     // state information, changes during iteration\r
403     private boolean done = false;\r
404     private int[] counts;\r
405     \r
406     // internal buffer for efficiency\r
407     private StringBuffer nextBuffer = new StringBuffer();\r
408     \r
409     // ========================  \r
410 \r
411     /**\r
412      * Reset to different source. Once reset, the iteration starts from the beginning.\r
413      * @param source The string to get case variants for\r
414      */\r
415     public void reset(String source) {\r
416         \r
417         // allocate arrays to store pieces\r
418         // using length might be slightly too long, but we don't care much\r
419         \r
420         counts = new int[source.length()];\r
421         variants = new String[source.length()][];\r
422         \r
423         // walk through the source, and break up into pieces\r
424         // each piece becomes an array of equivalent values\r
425         // TODO: could optimized this later to coalesce all single string pieces\r
426         \r
427         String piece = null;\r
428         count = 0;\r
429         for (int i = 0; i < source.length(); i += piece.length()) {\r
430             \r
431             // find *longest* matching piece\r
432             String caseFold = null;\r
433             \r
434             if (GENERATE) {\r
435                 // do exactly one CP\r
436                 piece = UTF16.valueOf(source, i);\r
437                 caseFold = (String) toCaseFold.get(piece);\r
438             } else {               \r
439                 int max = i + maxLength;\r
440                 if (max > source.length()) max = source.length();\r
441                 for (int j = max; j > i; --j) {\r
442                     piece = source.substring(i, j);\r
443                     caseFold = (String) toCaseFold.get(piece);\r
444                     if (caseFold != null) break;\r
445                 }\r
446             }\r
447             \r
448             // if we fail, pick one code point\r
449             if (caseFold == null) {\r
450                 piece = UTF16.valueOf(source, i);\r
451                 variants[count++] = new String[] {piece}; // single item string\r
452             } else {\r
453                 variants[count++] = (String[])fromCaseFold.get(caseFold);\r
454             }\r
455         }\r
456         reset();\r
457     }\r
458     \r
459     /**\r
460      * Restart the iteration from the beginning, but with same source\r
461      */\r
462     public void reset() {\r
463         done = false;\r
464         for (int i = 0; i < count; ++i) {\r
465             counts[i] = 0;\r
466         }\r
467     }\r
468     \r
469     /**\r
470      * Iterates through the case variants.\r
471      * @return next case variant. Each variant will case-fold to the same value as the source will.\r
472      * When the iteration is done, null is returned.\r
473      */\r
474     public String next() {\r
475         \r
476         if (done) return null;\r
477         int i;\r
478         \r
479         // TODO Optimize so we keep the piece before and after the current position\r
480         // so we don't have so much concatenation\r
481         \r
482         // get the result, a concatenation\r
483         \r
484         nextBuffer.setLength(0);\r
485         for (i = 0; i < count; ++i) {\r
486             nextBuffer.append(variants[i][counts[i]]);\r
487         }\r
488         \r
489         // find the next right set of pieces to concatenate\r
490         \r
491         for (i = count-1; i >= 0; --i) {\r
492             counts[i]++;\r
493             if (counts[i] < variants[i].length) break;\r
494             counts[i] = 0;\r
495         }\r
496         \r
497         // if we go too far, bail\r
498         \r
499         if (i < 0) {\r
500             done = true;\r
501         }\r
502         \r
503         return nextBuffer.toString();            \r
504     }\r
505         \r
506         \r
507     /**\r
508      * Temporary test, just to see how the stuff works.\r
509      */\r
510     static public void main(String[] args) {\r
511         String[] testCases = {"fiss", "h\u03a3"};\r
512         CaseIterator ci = new CaseIterator();\r
513         \r
514         for (int i = 0; i < testCases.length; ++i) {\r
515             String item = testCases[i];\r
516             System.out.println();\r
517             System.out.println("Testing: " + toName.transliterate(item));\r
518             System.out.println();\r
519             ci.reset(item);\r
520             int count = 0;\r
521             for (String temp = ci.next(); temp != null; temp = ci.next()) {\r
522                 System.out.println(toName.transliterate(temp));\r
523                 count++;\r
524             }\r
525             System.out.println("Total: " + count);\r
526         }\r
527 \r
528         // generate a list of all caseless characters -- characters whose\r
529         // case closure is themselves.\r
530 \r
531         UnicodeSet caseless = new UnicodeSet();\r
532 \r
533         for (int i = 0; i <= 0x10FFFF; ++i) {\r
534             String cp = UTF16.valueOf(i);\r
535             ci.reset(cp);\r
536             int count = 0;\r
537             String fold = null;\r
538             for (String temp = ci.next(); temp != null; temp = ci.next()) {\r
539                 fold = temp;\r
540                 if (++count > 1) break;\r
541             }\r
542             if (count==1 && fold.equals(cp)) {\r
543                 caseless.add(i);\r
544             }\r
545         }\r
546 \r
547         System.out.println("caseless = " + caseless.toPattern(true));\r
548 \r
549         UnicodeSet not_lc = new UnicodeSet("[:^lc:]");\r
550         \r
551         UnicodeSet a = new UnicodeSet();\r
552         a.set(not_lc);\r
553         a.removeAll(caseless);\r
554         System.out.println("[:^lc:] - caseless = " + a.toPattern(true));\r
555 \r
556         a.set(caseless);\r
557         a.removeAll(not_lc);\r
558         System.out.println("caseless - [:^lc:] = " + a.toPattern(true));\r
559     }\r
560 }\r