/*
 **********************************************************************
 * Copyright (c) 2003-2010, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Alan Liu
 * Created: February 11 2003
 * Since: ICU 2.6
 **********************************************************************
 */
package com.ibm.icu.dev.tool.translit;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * This class produces the data tables used by the closeOver() method
 * of UnicodeSet.
 *
 * Whenever the Unicode database changes, this tool must be re-run
 * (AFTER the data file(s) underlying ICU4J are updated).
 *
 * The output of this tool should then be pasted into the appropriate
 * files:
 *
 * ICU4J: com.ibm.icu.text.UnicodeSet.java
 * ICU4C: /icu/source/common/uniset.cpp
 */
class UnicodeSetCloseOver {

    // Our output files
    static final String JAVA_OUT          = "to_UnicodeSet.java";
    static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
    static final String C_SET_OUT         = "to_uniset.cpp";
    static final String C_UCHAR_OUT       = "to_uchar.c";

    // Source code "do not edit" warning
    static final String WARNING = "MACHINE-GENERATED; Unicode version " +
        UCharacter.getUnicodeVersion() + "; DO NOT EDIT; See " +
        UnicodeSetCloseOver.class.getName();

    // Case folding options flag.  This must correspond to the options
    // used in UnicodeSet.closeOver() in Java and C++.
    static final boolean DEFAULT_CASE_MAP = true; // false for Turkish

    public static void main(String[] args) throws IOException {
        System.out.println("This tool will generate several output files.  Each is named according");
        System.out.println("to the target file.  For example, the contents of to_UnicodeSet.java should");
        System.out.println("be pasted into UnicodeSet.java.");
        System.out.println();
        generateCaseData();
    }

    /**
     * Create a map of String => Set.  The String in this case is a
     * folded string for which
     * UCharacter.foldCase(folded, DEFAULT_CASE_MAP).equals(folded).
     * The Set contains all single-character strings x for which
     * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
     * well as folded itself.
     */
    static Map createCaseFoldEquivalencyClasses() {
        Map equivClasses = new HashMap();
        for (int i = 0; i <= 0x10FFFF; ++i) {
            int cat = UCharacter.getType(i);
            if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
                continue;

            String cp = UTF16.valueOf(i);
            String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
            if (folded.equals(cp)) continue;

            // At this point, have different case folding.  Add
            // the code point and its folded equivalent into the
            // equivalency class.
            TreeSet s = (TreeSet) equivClasses.get(folded);
            if (s == null) {
                s = new TreeSet();
                s.add(folded); // add the case fold result itself
                equivClasses.put(folded, s);
            }
            s.add(cp);
        }
        return equivClasses;
    }
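
    /**
     * Illustration only; a hedged sketch that is not used by the
     * generator (the method name and the sample key mentioned here are
     * hypothetical additions).  It prints one equivalency class from the
     * map built above.  For example, the class keyed by the folded string
     * "k" is expected to be { "K", "k", "\u212A" }, since both "K" and
     * U+212A KELVIN SIGN case-fold to "k".
     */
    static void dumpEquivalencyClass(Map equivClasses, String folded) {
        Set cls = (Set) equivClasses.get(folded);
        System.out.println("Equivalency class for \"" + folded + "\": " + cls);
    }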
    /**
     * Analyze the case fold equivalency classes.  Break them into two
     * groups: 'pairs', and 'nonpairs'.  Create a tally of the length
     * configurations of the nonpairs.
     *
     * Length configurations of equivalency classes, as of Unicode
     * 3.2.  Most of the classes (83%) have two single code points.
     * Here "112:28" means there are 28 equivalency classes with 2
     * single code points and one string of length 2.
     *
     *  11:656
     *  111:16
     *  1111:3
     *  112:28
     *  113:2
     *  12:31
     *  13:12
     *  22:38
     *
     * Note: This method does not count the frequencies of the
     * different length configurations (as shown above after ':');
     * it merely records which configurations occur.
     *
     * @param pairs Accumulate equivalency classes that consist of
     * exactly two code points here.  This is 83+% of the classes.
     * E.g., {"a", "A"}.
     *
     * @param nonpairs Accumulate other equivalency classes here, as
     * lists of strings.  E.g., {"st", "\uFB05", "\uFB06"}.
     *
     * @param lengths Accumulate a list of unique length structures,
     * not including pairs.  Each length structure is represented by
     * a string of digits.  The digit string "12" means the
     * equivalency class contains a single code point and a string of
     * length 2.  Typical contents of 'lengths': { "111", "1111",
     * "112", "113", "12", "13", "22" }.  Note the absence of "11".
     */
    static void analyzeCaseData(Map equivClasses,
                                StringBuffer pairs,
                                Vector nonpairs,
                                Vector lengths) {
        Iterator i = new TreeSet(equivClasses.keySet()).iterator();
        StringBuffer buf = new StringBuffer();
        while (i.hasNext()) {
            Object key = i.next();
            Vector v = new Vector((Set) equivClasses.get(key));
            if (v.size() == 2) {
                String a = (String) v.elementAt(0);
                String b = (String) v.elementAt(1);
                if (a.length() == 1 && b.length() == 1) {
                    pairs.append(a).append(b);
                    continue; // Note that pairs are not included in 'lengths'
                }
            }
            String[] a = new String[v.size()];
            v.toArray(a);
            nonpairs.add(a);

            //int singleCount = 0;
            //int stringCount = 0;

            // Make a string of the lengths, e.g., "111" means 3
            // single code points; "13" means a single code point
            // and a string of length 3.
            v.clear();
            for (int j = 0; j < a.length; ++j) {
                v.add(new Integer(a[j].length()));
            }
            Collections.sort(v);
            buf.setLength(0);
            for (int j = 0; j < v.size(); ++j) {
                buf.append(String.valueOf(v.elementAt(j)));
            }
            String config = buf.toString();
            if (!lengths.contains(config)) {
                lengths.add(config);
            }
        }
    }

    /**
     * Generate the case-closure data and write it to the output files.
     */
    static void generateCaseData() throws IOException {

        Map equivClasses = createCaseFoldEquivalencyClasses();

        StringBuffer pairs = new StringBuffer();
        Vector nonpairs = new Vector();
        Vector lengths = new Vector();
        analyzeCaseData(equivClasses, pairs, nonpairs, lengths);

        //-------------------------------------------------------------
        // C data: the case pairs.  They must be ordered by the folded
        // element so that the C code can binary search against the
        // even-indexed elements [0]='A', [2]='B', etc. (comparing,
        // e.g., x > 'y').  When a match is found at k, the pair is
        // [k], [k+1].
        PrintStream out = new PrintStream(new FileOutputStream(C_SET_OUT));
        System.out.println("Writing " + C_SET_OUT);

        // Sort the pairs.  They must be ordered by the folded element.
        // Store these as two-character strings, with charAt(0) being
        // the folded member of the pair.
        TreeSet sortPairs = new TreeSet(new Comparator() {
            public int compare(Object a, Object b) {
                return ((int) ((String) a).charAt(0)) -
                       ((int) ((String) b).charAt(0));
            }
            public boolean equals(Object obj) {
                return false;
            }
        });
        for (int i = 0; i < pairs.length(); i += 2) {
            String a = String.valueOf(pairs.charAt(i));
            String b = String.valueOf(pairs.charAt(i + 1));
            // Put the folded member of the pair first.
            if (UCharacter.foldCase(a, DEFAULT_CASE_MAP).equals(a)) {
                sortPairs.add(a + b);
            } else {
                sortPairs.add(b + a);
            }
        }

        // Emit the nonpair equivalency classes as a flat array of
        // UChars for C.  Each class is encoded as a short run of
        // UChars; the examples below show the pattern (multi-character
        // strings are set off by a zero, and each class ends in
        // zeros):
        //
        //   { ... }              => [ 'R', 0, 'i', 'Q', 0, 0 ]
        //   {"S", "D", "F", "G"} => [ 'S', 'D', 'F', 'G', 0, 0 ]
        //   {"jW", "jY"}         => [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
        //
        // The end-result is a short, flat array of UChar values that
        // can be used to initialize a UChar[] array in C.
        int maxLen = 0; // Maximum encoded length of any class, including zeros
        out.println("// " + WARNING);
        out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
        for (int j = 0; j < nonpairs.size(); ++j) {
            // Emit one initializer row per class; see the sketch of
            // emitCaseEquivClassRow() after this method.
            int len = emitCaseEquivClassRow(out, (String[]) nonpairs.elementAt(j));
            if (len > maxLen) maxLen = len;
        }
        out.println("};");

        // Make sure the CaseEquivClass data can fit.
        if (maxLen > 8) {
            throw new RuntimeException("Must adjust CaseEquivClass to accommodate " +
                                       maxLen + " UChars");
        }

        // Also make sure that we can map into this array using a
        // CompactByteArray.  We could do this check above, but we
        // keep it here, adjacent to the maxLen check.  We use one
        // value (-1 == 255) to indicate "no value."
        if (nonpairs.size() > 255) {
            throw new RuntimeException("Too many CASE_NONPAIRS array elements to be " +
                                       "indexed by a CompactByteArray");
        }

        //-------------------------------------------------------------
        // Case-unique set: All characters c for which closeOver(c)==c.
        // UPDATE: Instead of using this, we're using the related
        // notion of Case_Sensitive.  See below.  Note that
        // Case_Sensitive != ^Case_Unique.
        if (false) {
            UnicodeSet caseUnique = new UnicodeSet();
            for (int i = 0; i <= 0x10FFFF; ++i) {
                String cp = UTF16.valueOf(i);
                if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
                    caseUnique.add(i);
                }
            }
            // out.println("caseUnique = " + caseUnique.toPattern(true));
        }

        UnicodeSet caseSensitive = getCaseSensitive();
        //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));

        // Now for C, emit an array of ranges
        out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
        System.out.println("Writing " + C_UCHAR_OUT);
        out.println("/* " + WARNING + " */");
        emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");

        // For Java, emit a string with the ranges (each pair of chars
        // in the string is a range).
        out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
        System.out.println("Writing " + JAVA_CHARPROP_OUT);
        out.println(" // " + WARNING);
        emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
    }
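
    /**
     * Hedged sketch, not part of the original tool: the helper name, the
     * "{{...}}," row syntax, and the exact zero-termination rule are all
     * assumptions.  It encodes one nonpair equivalency class following the
     * pattern in the examples above (single code points as themselves,
     * longer strings preceded by a zero, two zeros to end the class),
     * prints it as one row of the CASE_NONPAIRS initializer, and returns
     * the encoded length so the caller can track maxLen.
     */
    static int emitCaseEquivClassRow(PrintStream out, String[] cls) {
        StringBuffer enc = new StringBuffer();
        for (int i = 0; i < cls.length; ++i) {
            if (cls[i].length() > 1) {
                enc.append((char) 0); // multi-character strings are zero-prefixed
            }
            enc.append(cls[i]);
        }
        enc.append((char) 0).append((char) 0); // terminate the class

        // Print the row as zero-padded hex UChar values (layout assumed).
        StringBuffer line = new StringBuffer("    {{");
        for (int i = 0; i < enc.length(); ++i) {
            if (i != 0) {
                line.append(", ");
            }
            String hex = Integer.toHexString(enc.charAt(i)).toUpperCase(Locale.ENGLISH);
            while (hex.length() < 4) {
                hex = "0" + hex;
            }
            line.append("0x").append(hex);
        }
        line.append("}},");
        out.println(line.toString());
        return enc.length();
    }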
    /**
     * Create the set of case-sensitive characters.  These are characters
     * that participate in any case mapping operation as a source or
     * as a member of a target string.
     */
    static UnicodeSet getCaseSensitive() {
        UnicodeSet caseSensitive = new UnicodeSet();
        Locale loc = Locale.US;
        BreakIterator bi = BreakIterator.getTitleInstance(loc);
        for (int c = 0; c <= 0x10FFFF; ++c) {
            String cp = UTF16.valueOf(c);
            for (int j = 0; j < 4; ++j) {
                String s = null;
                switch (j) {
                case 0: s = UCharacter.toUpperCase(loc, cp); break;
                case 1: s = UCharacter.toLowerCase(loc, cp); break;
                case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
                case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
                }
                if (!s.equals(cp)) {
                    int cc;
                    for (int k = 0; k < s.length(); k += UTF16.getCharCount(cc)) {
                        cc = UTF16.charAt(s, k);
                        int d = cc;
                        // Flag target characters that no case operation
                        // changes themselves; they enter the set only
                        // because they appear in a mapping target.
                        String ds = UTF16.valueOf(d);
                        if (UCharacter.toUpperCase(loc, ds).equals(ds) &&
                            UCharacter.toLowerCase(loc, ds).equals(ds) &&
                            UCharacter.toTitleCase(loc, ds, bi).equals(ds) &&
                            UCharacter.foldCase(ds, DEFAULT_CASE_MAP).equals(ds)) {
                            System.out.println("Warning: " + Utility.hex(c) +
                                               " -> " + d + " created NEW MAPPING" +
                                               " for Case_Sensitive");
                        }
                        caseSensitive.add(c);
                        caseSensitive.add(d);
                    }
                }
            }
        }
        return caseSensitive;
    }

    /**
     * Given a UnicodeSet, emit it as an array of UChar pairs.  Each
     * pair will be the start/end of a range.  Code points >= U+10000
     * will be represented as surrogate pairs.
     */
    static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
        // Store the pairs in a StringBuffer.  This handles surrogate
        // representation.
        StringBuffer buf = new StringBuffer();
        for (int i = 0; i < set.getRangeCount(); ++i) {
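            // A hedged sketch of how this loop presumably continues, based
            // on the Javadoc above (an assumption, not the original body):
            // append each range's start and end code points, letting
            // UTF16.append() expand code points >= U+10000 into surrogate
            // pairs:
            //
            //     UTF16.append(buf, set.getRangeStart(i));
            //     UTF16.append(buf, set.getRangeEnd(i));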