2 **********************************************************************
\r
3 * Copyright (c) 2003, International Business Machines
\r
4 * Corporation and others. All Rights Reserved.
\r
5 **********************************************************************
\r
7 * Created: February 11 2003
\r
9 **********************************************************************
\r
11 package com.ibm.icu.dev.tool.translit;
\r
13 import com.ibm.icu.lang.UCharacter;
\r
14 import com.ibm.icu.text.*;
\r
15 import com.ibm.icu.impl.Utility;
\r
16 import java.util.Collections;
\r
17 import java.util.Comparator;
\r
18 import java.util.HashMap;
\r
19 import java.util.Iterator;
\r
20 import java.util.Map;
\r
21 import java.util.Set;
\r
22 import java.util.TreeSet;
\r
23 import java.util.Vector;
\r
24 import java.util.Locale;
\r
28 * This class produces the data tables used by the closeOver() method
\r
31 * Whenever the Unicode database changes, this tool must be re-run
\r
32 * (AFTER the data file(s) underlying ICU4J are udpated).
\r
34 * The output of this tool should then be pasted into the appropriate
\r
37 * ICU4J: com.ibm.icu.text.UnicodeSet.java
\r
38 * ICU4C: /icu/source/common/uniset.cpp
\r
40 class UnicodeSetCloseOver {
\r
43 static final String JAVA_OUT = "to_UnicodeSet.java";
\r
44 static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
\r
45 static final String C_SET_OUT = "to_uniset.cpp";
\r
46 static final String C_UCHAR_OUT = "to_uchar.c";
\r
48 // Source code "do not edit" warning
\r
49 static final String WARNING = "MACHINE-GENERATED; Unicode version " +
\r
50 UCharacter.getUnicodeVersion() +
\r
51 "; DO NOT EDIT; See " +
\r
52 UnicodeSetCloseOver.class.getName();
\r
54 // Case folding options flag. This must correspond to the options
\r
55 // used in UnicodeSet.closeOver() in Java and C++.
\r
56 static final boolean DEFAULT_CASE_MAP = true; // false for Turkish
\r
58 public static void main(String[] args) throws IOException {
\r
59 System.out.println("This tool will generate several output files. Each is named according");
\r
60 System.out.println("the target file. For example, the contents of to_UnicodeSet.java should");
\r
61 System.out.println("be pasted into UnicodeSet.java.");
\r
62 System.out.println();
\r
68 * Create a map of String => Set. The String in this case is a
\r
69 * folded string for which
\r
70 * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
\r
71 * The Set contains all single-character strings x for which
\r
72 * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
\r
73 * well as folded itself.
\r
75 static Map createCaseFoldEquivalencyClasses() {
\r
76 Map equivClasses = new HashMap();
\r
77 for (int i = 0; i <= 0x10FFFF; ++i) {
\r
78 int cat = UCharacter.getType(i);
\r
79 if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
\r
82 String cp = UTF16.valueOf(i);
\r
83 String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
\r
84 if (folded.equals(cp)) continue;
\r
86 // At this point, have different case folding. Add
\r
87 // the code point and its folded equivalent into the
\r
88 // equivalency class.
\r
89 TreeSet s = (TreeSet) equivClasses.get(folded);
\r
92 s.add(folded); // add the case fold result itself
\r
93 equivClasses.put(folded, s);
\r
97 return equivClasses;
\r
101 * Analyze the case fold equivalency classes. Break them into two
\r
102 * groups: 'pairs', and 'nonpairs'. Create a tally of the length
\r
103 * configurations of the nonpairs.
\r
105 * Length configurations of equivalency classes, as of Unicode
\r
106 * 3.2. Most of the classes (83%) have two single codepoints.
\r
107 * Here "112:28" means there are 28 equivalency classes with 2
\r
108 * single codepoints and one string of length 2.
\r
119 * Note: This method does not count the frequencies of the
\r
120 * different length configurations (as shown above after ':'); it
\r
121 * merely records which configurations occur.
\r
123 * @param pairs Accumulate equivalency classes that consist of
\r
124 * exactly two codepoints here. This is 83+% of the classes.
\r
125 * E.g., {"a", "A"}.
\r
126 * @param nonpairs Accumulate other equivalency classes here, as
\r
127 * lists of strings. E,g, {"st", "\uFB05", "\uFB06"}.
\r
128 * @param lengths Accumulate a list of unique length structures,
\r
129 * not including pairs. Each length structure is represented by a
\r
130 * string of digits. The digit string "12" means the equivalency
\r
131 * class contains a single code point and a string of length 2.
\r
132 * Typical contents of 'lengths': { "111", "1111", "112",
\r
133 * "113", "12", "13", "22" }. Note the absence of "11".
\r
135 static void analyzeCaseData(Map equivClasses,
\r
136 StringBuffer pairs,
\r
139 Iterator i = new TreeSet(equivClasses.keySet()).iterator();
\r
140 StringBuffer buf = new StringBuffer();
\r
141 while (i.hasNext()) {
\r
142 Object key = i.next();
\r
143 Vector v = new Vector((Set) equivClasses.get(key));
\r
144 if (v.size() == 2) {
\r
145 String a = (String) v.elementAt(0);
\r
146 String b = (String) v.elementAt(1);
\r
147 if (a.length() == 1 && b.length() == 1) {
\r
148 pairs.append(a).append(b);
\r
150 // Note that pairs are included in 'lengths'
\r
153 String[] a = new String[v.size()];
\r
156 //int singleCount = 0;
\r
157 //int stringCount = 0;
\r
158 // Make a string of the lengths, e.g., "111" means 3
\r
159 // single code points; "13" means a single code point
\r
160 // and a string of length 3.
\r
162 for (int j=0; j<a.length; ++j) {
\r
163 v.add(new Integer(a[j].length()));
\r
165 Collections.sort(v);
\r
167 for (int j=0; j<v.size(); ++j) {
\r
168 buf.append(String.valueOf(v.elementAt(j)));
\r
170 if (!lengths.contains(buf.toString())) {
\r
171 lengths.add(buf.toString());
\r
176 static void generateCaseData() throws IOException {
\r
178 Map equivClasses = createCaseFoldEquivalencyClasses();
\r
180 // Accumulate equivalency classes that consist of exactly
\r
181 // two codepoints here. This is 83+% of the classes.
\r
182 // E.g., {"a", "A"}.
\r
183 StringBuffer pairs = new StringBuffer();
\r
185 // Accumulate other equivalency classes here, as lists
\r
186 // of strings. E,g, {"st", "\uFB05", "\uFB06"}.
\r
187 Vector nonpairs = new Vector(); // contains String[]
\r
188 Vector lengths = new Vector(); // "111", "12", "22", etc.
\r
190 analyzeCaseData(equivClasses, pairs, nonpairs, lengths);
\r
192 //-------------------------------------------------------------
\r
193 // Emit Java source
\r
194 PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
\r
195 System.out.println("Writing " + JAVA_OUT);
\r
197 out.println(" // " + WARNING);
\r
198 out.println(" private static final String CASE_PAIRS =");
\r
199 out.println(Utility.formatForSource(pairs.toString()) + ";");
\r
201 out.println(" // " + WARNING);
\r
202 out.println(" private static final String[][] CASE_NONPAIRS = {");
\r
203 for (int j=0; j<nonpairs.size(); ++j) {
\r
204 String[] a = (String[]) nonpairs.elementAt(j);
\r
206 for (int k=0; k<a.length; ++k) {
\r
207 if (k != 0) out.print(", ");
\r
208 out.print(Utility.format1ForSource(a[k]));
\r
212 out.println(" };");
\r
214 //-------------------------------------------------------------
\r
217 // In C++, the pairs are again emitted in an array, but this
\r
218 // array is the final representation form -- it will not be
\r
219 // reprocessed into a hash. It will be binary searched by
\r
220 // looking at the even elements [0], [2], [4], etc., and
\r
221 // ignoring the odd elements. The even elements must contain
\r
222 // the folded members of the pairs. That is, in the pair
\r
223 // {'A', 'a'}, the even element must be 'a', not 'A'. Then a
\r
224 // code point to be located is first folded ('Y' => 'y') then
\r
225 // it binary searched against [0]='A', [2]='B', etc. When a
\r
226 // match is found at k, the pair is [k], [k+1].
\r
228 out = new PrintStream(new FileOutputStream(C_SET_OUT));
\r
229 System.out.println("Writing " + C_SET_OUT);
\r
231 // Sort the pairs. They must be ordered by the folded element.
\r
232 // Store these as two-character strings, with charAt(0) being
\r
233 // the folded member of the pair.
\r
234 TreeSet sortPairs = new TreeSet(new Comparator() {
\r
235 public int compare(Object a, Object b) {
\r
236 return ((int) ((String) a).charAt(0)) -
\r
237 ((int) ((String) b).charAt(0));
\r
239 public boolean equals(Object obj) {
\r
243 for (int i=0; i<pairs.length(); i+=2) {
\r
244 String a = String.valueOf(pairs.charAt(i));
\r
245 String b = String.valueOf(pairs.charAt(i+1));
\r
246 String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
\r
247 if (a.equals(folded)) {
\r
248 sortPairs.add(a + b);
\r
250 sortPairs.add(b + a);
\r
255 out.println("// " + WARNING);
\r
256 out.println("static const UChar CASE_PAIRS[] = {");
\r
257 Iterator it = sortPairs.iterator();
\r
258 while (it.hasNext()) {
\r
261 while (n++ < 5 && it.hasNext()) {
\r
262 String s = (String) it.next();
\r
263 //out.print((int) s.charAt(0) + "," +
\r
264 // (int) s.charAt(1) + ",");
\r
265 out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
\r
266 Utility.hex(s.charAt(1)) + ",");
\r
273 // The non-pairs are encoded in the following way. All the
\r
274 // single codepoints in each class are grouped together
\r
275 // followed by a zero. Then each multi-character string is
\r
276 // added, followed by a zero. Finally, another zero is added.
\r
278 // {"iQ", "R"} => [ 'R', 0, 'i', 'Q', 0, 0 ]
\r
279 // {"S", "D", "F", "G"} => [ 'S', 'D', 'F', 'G', 0, 0 ]
\r
280 // {"jW", "jY"} => [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
\r
281 // The end-result is a short, flat array of UChar values that
\r
282 // can be used to initialize a UChar[] array in C.
\r
284 int maxLen = 0; // Maximum encoded length of any class, including zeros
\r
285 out.println("// " + WARNING);
\r
286 out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
\r
287 for (int j=0; j<nonpairs.size(); ++j) {
\r
289 String[] a = (String[]) nonpairs.elementAt(j);
\r
291 // Emit single code points
\r
292 for (int k=0; k<a.length; ++k) {
\r
293 if (a[k].length() != 1) continue;
\r
294 //out.print((int) a[k].charAt(0) + ",");
\r
295 out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
\r
298 out.print("0, "); // End of single code points
\r
300 // Emit multi-character strings
\r
301 for (int k=0; k<a.length; ++k) {
\r
302 if (a[k].length() == 1) continue;
\r
303 for (int m=0; m<a[k].length(); ++m) {
\r
304 //out.print((int) a[k].charAt(m) + ",");
\r
305 out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
\r
308 out.print("0, "); // End of string
\r
311 out.println("0},"); // End of equivalency class
\r
313 if (len > maxLen) maxLen = len;
\r
317 // Make sure the CaseEquivClass data can fit.
\r
319 throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
\r
322 // Also make sure that we can map into this array using a
\r
323 // CompactByteArray. We could do this check above, but we
\r
324 // keep it here, adjacent to the maxLen check. We use one
\r
325 // value (-1 == 255) to indicate "no value."
\r
326 if (nonpairs.size() > 255) {
\r
327 throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
\r
330 //-------------------------------------------------------------
\r
331 // Case-unique set: All characters c for which closeOver(c)==c.
\r
333 // UPDATE: Instead of using this, we're using the related
\r
334 // notion of Case_Sensitive. See below. Note that
\r
335 // Case_Sensitive != ^Case_Unique.
\r
338 UnicodeSet caseUnique = new UnicodeSet();
\r
339 for (int i = 0; i <= 0x10FFFF; ++i) {
\r
340 String cp = UTF16.valueOf(i);
\r
341 if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
\r
345 // out.println("caseUnique = " + caseUnique.toPattern(true));
\r
348 UnicodeSet caseSensitive = getCaseSensitive();
\r
349 //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));
\r
351 // Now for C, emit an array of ranges
\r
352 out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
\r
353 System.out.println("Writing " + C_UCHAR_OUT);
\r
355 out.println("/* " + WARNING + " */");
\r
356 emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");
\r
358 // For Java, emit a string with the ranges (each pair of chars
\r
359 // in the string is a range).
\r
360 out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
\r
361 System.out.println("Writing " + JAVA_CHARPROP_OUT);
\r
362 out.println(" // " + WARNING);
\r
363 emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
\r
367 * Create the set of case-sensitive characters. These are characters
\r
368 * that participate in any case mapping operation as a source or
\r
369 * as a member of a target string.
\r
371 static UnicodeSet getCaseSensitive() {
\r
372 UnicodeSet caseSensitive = new UnicodeSet();
\r
373 Locale loc = Locale.US;
\r
374 BreakIterator bi = BreakIterator.getTitleInstance(loc);
\r
375 for (int c = 0; c <= 0x10FFFF; ++c) {
\r
376 String cp = UTF16.valueOf(c);
\r
377 for (int j=0; j<4; ++j) {
\r
380 case 0: s = UCharacter.toUpperCase(loc, cp); break;
\r
381 case 1: s = UCharacter.toLowerCase(loc, cp); break;
\r
382 case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
\r
383 case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
\r
385 if (!s.equals(cp)) {
\r
387 for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
\r
388 cc = UTF16.charAt(s, k);
\r
389 caseSensitive.add(cc);
\r
391 for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
\r
392 cc = UTF16.charAt(cp, k);
\r
393 caseSensitive.add(cc);
\r
397 // Also do the single-codepoint API. This shouldn't add any
\r
398 // code points, but we do it for completeness.
\r
399 for (int j=0; j<4; ++j) {
\r
402 case 0: d = UCharacter.toUpperCase(c); break;
\r
403 case 1: d = UCharacter.toLowerCase(c); break;
\r
404 case 2: d = UCharacter.toTitleCase(c); break;
\r
405 case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
\r
408 if (!caseSensitive.contains(c) ||
\r
409 !caseSensitive.contains(d)) {
\r
410 System.out.println("Warning: code point " + c +
\r
411 " => " + d + " created NEW MAPPING"+
\r
412 " for Case_Sensitive");
\r
414 caseSensitive.add(c);
\r
415 caseSensitive.add(d);
\r
419 return caseSensitive;
\r
423 * Given a UnicodeSet, emit it as an array of UChar pairs. Each
\r
424 * pair will be the start/end of a range. Code points >= U+10000
\r
425 * will be represented as surrogate pairs.
\r
427 static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
\r
428 // Store the pairs in a StringBuffer. This handles surrogate
\r
430 StringBuffer buf = new StringBuffer();
\r
431 for (int i=0; i<set.getRangeCount(); ++i) {
\r
432 UTF16.append(buf, set.getRangeStart(i));
\r
433 UTF16.append(buf, set.getRangeEnd(i));
\r
436 out.println("static const UChar " + id + "[] = {");
\r
437 for (int i=0; i<buf.length(); ) {
\r
439 for (int n=0; n++<10 && i<buf.length(); ++i) {
\r
440 out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
\r
445 out.println("#define " + id + "_LENGTH (sizeof(" + id +
\r
446 ")/sizeof(" + id + "[0]))");
\r
450 * Given a UnicodeSet, emit it as a Java string. The most economical
\r
451 * format is not the pattern, but instead a pairs list, with each
\r
452 * range pair represented as two adjacent characters.
\r
454 static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
\r
455 // Store the pairs in a StringBuffer. This handles surrogate
\r
457 StringBuffer buf = new StringBuffer();
\r
458 for (int i=0; i<set.getRangeCount(); ++i) {
\r
459 UTF16.append(buf, set.getRangeStart(i));
\r
460 UTF16.append(buf, set.getRangeEnd(i));
\r
463 out.println(" private static final String " + id + " =");
\r
464 out.println(Utility.formatForSource(buf.toString()) + ";");
\r