/** * Builds the normalization tables. This is a separate class so that it * can be unloaded once not needed.
* Copyright (C) 1998-2007 International Business Machines Corporation and * Unicode, Inc. All Rights Reserved.
* The Unicode Consortium makes no expressed or implied warranty of any * kind, and assumes no liability for errors or omissions. * No liability is assumed for incidental and consequential damages * in connection with or arising out of the use of the information here. * @author Mark Davis * Updates for supplementary code points: * Vladimir Weinstein & Markus Scherer */ package com.ibm.icu.dev.test.normalizer; import java.io.BufferedReader; import java.util.BitSet; import com.ibm.icu.dev.test.TestUtil; import com.ibm.icu.dev.test.UTF16Util; class NormalizerBuilder { //private static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc."; /** * Testing flags */ private static final boolean DEBUG = false; //private static final boolean GENERATING = false; /** * Constants for the data file version to use. */ /*static final boolean NEW_VERSION = true; private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : ""); static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9"; static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1"; public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt"; public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt"; */ /** * Called exactly once by NormalizerData to build the static data */ static NormalizerData build(boolean fullData) { try { IntHashtable canonicalClass = new IntHashtable(0); IntStringHashtable decompose = new IntStringHashtable(null); LongHashtable compose = new LongHashtable(NormalizerData.NOT_COMPOSITE); BitSet isCompatibility = new BitSet(); BitSet isExcluded = new BitSet(); if (fullData) { //System.out.println("Building Normalizer Data from file."); readExclusionList(isExcluded); //System.out.println(isExcluded.get(0x00C0)); buildDecompositionTables(canonicalClass, decompose, compose, isCompatibility, isExcluded); } else { // for use in Applets //System.out.println("Building abridged data."); setMinimalDecomp(canonicalClass, decompose, compose, isCompatibility, isExcluded); } return new NormalizerData(canonicalClass, decompose, compose, isCompatibility, isExcluded); } catch (java.io.IOException e) { System.err.println("Can't load data file." + e + ", " + e.getMessage()); return null; } } // ============================================================= // Building Decomposition Tables // ============================================================= /** * Reads exclusion list and stores the data */ private static void readExclusionList(BitSet isExcluded) throws java.io.IOException { if (DEBUG) System.out.println("Reading Exclusions"); BufferedReader in = TestUtil.getDataReader("unicode/CompositionExclusions.txt"); while (true) { // read a line, discarding comments and blank lines String line = in.readLine(); if (line == null) break; int comment = line.indexOf('#'); // strip comments if (comment != -1) line = line.substring(0,comment); if (line.length() == 0) continue; // ignore blanks if(line.indexOf(' ') != -1) { line = line.substring(0, line.indexOf(' ')); } // store -1 in the excluded table for each character hit int value = Integer.parseInt(line,16); isExcluded.set(value); //System.out.println("Excluding " + hex(value)); } in.close(); if (DEBUG) System.out.println("Done reading Exclusions"); } /** * Builds a decomposition table from a UnicodeData file */ private static void buildDecompositionTables( IntHashtable canonicalClass, IntStringHashtable decompose, LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) throws java.io.IOException { if (DEBUG) System.out.println("Reading Unicode Character Database"); //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024); BufferedReader in = null; try { in = TestUtil.getDataReader("unicode/UnicodeData.txt"); } catch (Exception e) { System.err.println("Failed to read UnicodeData.txt"); System.exit(1); } int value; long pair; int counter = 0; while (true) { // read a line, discarding comments and blank lines String line = in.readLine(); if (line == null) break; int comment = line.indexOf('#'); // strip comments if (comment != -1) line = line.substring(0,comment); if (line.length() == 0) continue; if (DEBUG) { counter++; if ((counter & 0xFF) == 0) System.out.println("At: " + line); } // find the values of the particular fields that we need // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0; int start = 0; int end = line.indexOf(';'); // code value = Integer.parseInt(line.substring(start,end),16); if (true && value == '\u00c0') { //System.out.println("debug: " + line); } end = line.indexOf(';',start=end+1); // name /*String name = line.substring(start,end);*/ end = line.indexOf(';',start=end+1); // general category end = line.indexOf(';',start=end+1); // canonical class // check consistency: canonical classes must be from 0 to 255 int cc = Integer.parseInt(line.substring(start,end)); if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line); canonicalClass.put(value,cc); end = line.indexOf(';',start=end+1); // BIDI end = line.indexOf(';',start=end+1); // decomp // decomp requires more processing. // store whether it is canonical or compatibility. // store the decomp in one table, and the reverse mapping (from pairs) in another if (start != end) { String segment = line.substring(start, end); boolean compat = segment.charAt(0) == '<'; if (compat) isCompatibility.set(value); String decomp = fromHex(segment); // a small snippet of code to generate the Applet data /*if (GENERATING) { if (value < 0xFF) { System.out.println( "\"\\u" + hex((char)value) + "\", " + "\"\\u" + hex(decomp, "\\u") + "\", " + (compat ? "\"K\"," : "\"\",") + "// " + name); } }*/ // check consistency: all canon decomps must be singles or pairs! int decompLen = UTF16Util.countCodePoint(decomp); if (decompLen < 1 || decompLen > 2 && !compat) { System.err.println("Bad decomp at: " + line); } decompose.put(value, decomp); // only compositions are canonical pairs // skip if script exclusion if (!compat && !isExcluded.get(value)) { int first = '\u0000'; int second = UTF16Util.nextCodePoint(decomp, 0); if (decompLen > 1) { first = second; second = UTF16Util.nextCodePoint(decomp, UTF16Util.codePointLength(first)); } // store composition pair in single integer pair = ((long)first << 32) | second; if (DEBUG && value == '\u00C0') { System.out.println("debug2: " + line); } compose.put(pair, value); } else if (DEBUG) { System.out.println("Excluding: " + decomp); } } } in.close(); if (DEBUG) System.out.println("Done reading Unicode Character Database"); // add algorithmic Hangul decompositions // this is more compact if done at runtime, but for simplicity we // do it this way. if (DEBUG) System.out.println("Adding Hangul"); for (int SIndex = 0; SIndex < SCount; ++SIndex) { int TIndex = SIndex % TCount; char first, second; if (TIndex != 0) { // triple first = (char)(SBase + SIndex - TIndex); second = (char)(TBase + TIndex); } else { first = (char)(LBase + SIndex / NCount); second = (char)(VBase + (SIndex % NCount) / TCount); } pair = ((long)first << 32) | second; value = SIndex + SBase; decompose.put(value, String.valueOf(first) + second); compose.put(pair, value); } if (DEBUG) System.out.println("Done adding Hangul"); } /** * Hangul composition constants */ static final int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, // 588 SCount = LCount * NCount; // 11172 /** * For use in an applet: just load a minimal set of data. */ private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose, LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) { String[] decomposeData = { "\u005E", "\u0020\u0302", "K", "\u005F", "\u0020\u0332", "K", "\u0060", "\u0020\u0300", "K", "\u00A0", "\u0020", "K", "\u00A8", "\u0020\u0308", "K", "\u00AA", "\u0061", "K", "\u00AF", "\u0020\u0304", "K", "\u00B2", "\u0032", "K", "\u00B3", "\u0033", "K", "\u00B4", "\u0020\u0301", "K", "\u00B5", "\u03BC", "K", "\u00B8", "\u0020\u0327", "K", "\u00B9", "\u0031", "K", "\u00BA", "\u006F", "K", "\u00BC", "\u0031\u2044\u0034", "K", "\u00BD", "\u0031\u2044\u0032", "K", "\u00BE", "\u0033\u2044\u0034", "K", "\u00C0", "\u0041\u0300", "", "\u00C1", "\u0041\u0301", "", "\u00C2", "\u0041\u0302", "", "\u00C3", "\u0041\u0303", "", "\u00C4", "\u0041\u0308", "", "\u00C5", "\u0041\u030A", "", "\u00C7", "\u0043\u0327", "", "\u00C8", "\u0045\u0300", "", "\u00C9", "\u0045\u0301", "", "\u00CA", "\u0045\u0302", "", "\u00CB", "\u0045\u0308", "", "\u00CC", "\u0049\u0300", "", "\u00CD", "\u0049\u0301", "", "\u00CE", "\u0049\u0302", "", "\u00CF", "\u0049\u0308", "", "\u00D1", "\u004E\u0303", "", "\u00D2", "\u004F\u0300", "", "\u00D3", "\u004F\u0301", "", "\u00D4", "\u004F\u0302", "", "\u00D5", "\u004F\u0303", "", "\u00D6", "\u004F\u0308", "", "\u00D9", "\u0055\u0300", "", "\u00DA", "\u0055\u0301", "", "\u00DB", "\u0055\u0302", "", "\u00DC", "\u0055\u0308", "", "\u00DD", "\u0059\u0301", "", "\u00E0", "\u0061\u0300", "", "\u00E1", "\u0061\u0301", "", "\u00E2", "\u0061\u0302", "", "\u00E3", "\u0061\u0303", "", "\u00E4", "\u0061\u0308", "", "\u00E5", "\u0061\u030A", "", "\u00E7", "\u0063\u0327", "", "\u00E8", "\u0065\u0300", "", "\u00E9", "\u0065\u0301", "", "\u00EA", "\u0065\u0302", "", "\u00EB", "\u0065\u0308", "", "\u00EC", "\u0069\u0300", "", "\u00ED", "\u0069\u0301", "", "\u00EE", "\u0069\u0302", "", "\u00EF", "\u0069\u0308", "", "\u00F1", "\u006E\u0303", "", "\u00F2", "\u006F\u0300", "", "\u00F3", "\u006F\u0301", "", "\u00F4", "\u006F\u0302", "", "\u00F5", "\u006F\u0303", "", "\u00F6", "\u006F\u0308", "", "\u00F9", "\u0075\u0300", "", "\u00FA", "\u0075\u0301", "", "\u00FB", "\u0075\u0302", "", "\u00FC", "\u0075\u0308", "", "\u00FD", "\u0079\u0301", "", // EXTRAS, outside of Latin 1 "\u1EA4", "\u00C2\u0301", "", "\u1EA5", "\u00E2\u0301", "", "\u1EA6", "\u00C2\u0300", "", "\u1EA7", "\u00E2\u0300", "", }; int[] classData = { 0x0300, 230, 0x0301, 230, 0x0302, 230, 0x0303, 230, 0x0304, 230, 0x0305, 230, 0x0306, 230, 0x0307, 230, 0x0308, 230, 0x0309, 230, 0x030A, 230, 0x030B, 230, 0x030C, 230, 0x030D, 230, 0x030E, 230, 0x030F, 230, 0x0310, 230, 0x0311, 230, 0x0312, 230, 0x0313, 230, 0x0314, 230, 0x0315, 232, 0x0316, 220, 0x0317, 220, 0x0318, 220, 0x0319, 220, 0x031A, 232, 0x031B, 216, 0x031C, 220, 0x031D, 220, 0x031E, 220, 0x031F, 220, 0x0320, 220, 0x0321, 202, 0x0322, 202, 0x0323, 220, 0x0324, 220, 0x0325, 220, 0x0326, 220, 0x0327, 202, 0x0328, 202, 0x0329, 220, 0x032A, 220, 0x032B, 220, 0x032C, 220, 0x032D, 220, 0x032E, 220, 0x032F, 220, 0x0330, 220, 0x0331, 220, 0x0332, 220, 0x0333, 220, 0x0334, 1, 0x0335, 1, 0x0336, 1, 0x0337, 1, 0x0338, 1, 0x0339, 220, 0x033A, 220, 0x033B, 220, 0x033C, 220, 0x033D, 230, 0x033E, 230, 0x033F, 230, 0x0340, 230, 0x0341, 230, 0x0342, 230, 0x0343, 230, 0x0344, 230, 0x0345, 240, 0x0360, 234, 0x0361, 234 }; // build the same tables we would otherwise get from the // Unicode Character Database, just with limited data for (int i = 0; i < decomposeData.length; i+=3) { char value = decomposeData[i].charAt(0); String decomp = decomposeData[i+1]; boolean compat = decomposeData[i+2].equals("K"); if (compat) isCompatibility.set(value); decompose.put(value, decomp); if (!compat) { int first = '\u0000'; int second = UTF16Util.nextCodePoint(decomp, 0); if (decomp.length() > 1) { first = second; second = UTF16Util.nextCodePoint(decomp, UTF16Util.codePointLength(first)); } long pair = (first << 16) | second; compose.put(pair, value); } } for (int i = 0; i < classData.length;) { canonicalClass.put(classData[i++], classData[i++]); } } /** * Utility: Parses a sequence of hex Unicode characters separated by spaces */ static public String fromHex(String source) { StringBuffer result = new StringBuffer(); for (int i = 0; i < source.length(); ++i) { char c = source.charAt(i); switch (c) { case ' ': break; // ignore case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': int end = 0; int value = 0; try { //System.out.println(source.substring(i, i + 4) + "************" + source); end = source.indexOf(' ',i); if (end < 0) { end = source.length(); } value = Integer.parseInt(source.substring(i, end),16); UTF16Util.appendCodePoint(result, value); } catch (Exception e) { System.out.println("i: " + i + ";end:" + end + "source:" + source); //System.out.println(source.substring(i, i + 4) + "************" + source); System.exit(1); } //i+= 3; // skip rest of number i = end; break; case '<': int j = source.indexOf('>',i); // skip <...> if (j > 0) { i = j; break; } // else fall through--error default: throw new IllegalArgumentException("Bad hex value in " + source); } } return result.toString(); } /** * Utility: Supplies a zero-padded hex representation of an integer (without 0x) */ static public String hex(int i) { String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase(); return "00000000".substring(result.length(),8) + result; } /** * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u) */ static public String hex(char i) { String result = Integer.toString(i, 16).toUpperCase(); return "0000".substring(result.length(),4) + result; } /** * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u) */ public static String hex(String s, String sep) { StringBuffer result = new StringBuffer(); for (int i = 0; i < s.length(); ++i) { if (i != 0) result.append(sep); result.append(hex(s.charAt(i))); } return result.toString(); } }