2 * Builds the normalization tables. This is a separate class so that it
3 * can be unloaded once not needed.<br>
4 * Copyright (C) 1998-2007 International Business Machines Corporation and
5 * Unicode, Inc. All Rights Reserved.<br>
6 * The Unicode Consortium makes no expressed or implied warranty of any
7 * kind, and assumes no liability for errors or omissions.
8 * No liability is assumed for incidental and consequential damages
9 * in connection with or arising out of the use of the information here.
11 * Updates for supplementary code points:
12 * Vladimir Weinstein & Markus Scherer
15 package com.ibm.icu.dev.test.normalizer;
17 import java.io.BufferedReader;
18 import java.util.BitSet;
20 import com.ibm.icu.dev.test.TestUtil;
21 import com.ibm.icu.dev.test.UTF16Util;
23 class NormalizerBuilder {
24 //private static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
// When true, the table-building methods of this class print progress messages to System.out.
30 private static final boolean DEBUG = false;
31 //private static final boolean GENERATING = false;
34 * Constants for the data file version to use.
36 /*static final boolean NEW_VERSION = true;
37 private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");
39 static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
40 static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";
42 public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
43 public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
47 * Called exactly once by NormalizerData to build the static data
50 static NormalizerData build(boolean fullData) {
52 IntHashtable canonicalClass = new IntHashtable(0);
53 IntStringHashtable decompose = new IntStringHashtable(null);
54 LongHashtable compose = new LongHashtable(NormalizerData.NOT_COMPOSITE);
55 BitSet isCompatibility = new BitSet();
56 BitSet isExcluded = new BitSet();
58 //System.out.println("Building Normalizer Data from file.");
59 readExclusionList(isExcluded);
60 //System.out.println(isExcluded.get(0x00C0));
61 buildDecompositionTables(canonicalClass, decompose, compose,
62 isCompatibility, isExcluded);
63 } else { // for use in Applets
64 //System.out.println("Building abridged data.");
65 setMinimalDecomp(canonicalClass, decompose, compose,
66 isCompatibility, isExcluded);
68 return new NormalizerData(canonicalClass, decompose, compose,
69 isCompatibility, isExcluded);
70 } catch (java.io.IOException e) {
71 System.err.println("Can't load data file." + e + ", " + e.getMessage());
76 // =============================================================
77 // Building Decomposition Tables
78 // =============================================================
81 * Reads exclusion list and stores the data
83 private static void readExclusionList(BitSet isExcluded) throws java.io.IOException {
84 if (DEBUG) System.out.println("Reading Exclusions");
86 BufferedReader in = TestUtil.getDataReader("unicode/CompositionExclusions.txt");
89 // read a line, discarding comments and blank lines
91 String line = in.readLine();
92 if (line == null) break;
93 int comment = line.indexOf('#'); // strip comments
94 if (comment != -1) line = line.substring(0,comment);
95 if (line.length() == 0) continue; // ignore blanks
96 if(line.indexOf(' ') != -1) {
97 line = line.substring(0, line.indexOf(' '));
99 // store -1 in the excluded table for each character hit
101 int value = Integer.parseInt(line,16);
102 isExcluded.set(value);
103 //System.out.println("Excluding " + hex(value));
106 if (DEBUG) System.out.println("Done reading Exclusions");
110 * Builds a decomposition table from a UnicodeData file
// Fills the normalization tables from the reader returned by
// TestUtil.getDataReader("unicode/UnicodeData.txt"), then adds the Hangul syllable mappings.
//   canonicalClass  - receives code point -> canonical class (fourth ';'-separated field)
//   decompose       - receives code point -> decomposition string
//   compose         - receives packed (first, second) pair -> composite code point
//   isCompatibility - a bit is set for each code point whose decomposition field starts with '<'
//   isExcluded      - only read here; it is consulted before a pair is recorded in compose
// Declared to throw java.io.IOException (in.readLine() is the visible source).
// NOTE(review): this view of the method has gaps -- the declarations of value, pair, counter,
// start, first and second, the loop header, several conditionals and all closing braces are not
// visible -- so the comments below describe only the statements that are shown.
112 private static void buildDecompositionTables(
113 IntHashtable canonicalClass, IntStringHashtable decompose,
114 LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
115 throws java.io.IOException {
116 if (DEBUG) System.out.println("Reading Unicode Character Database");
117 //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
118 BufferedReader in = null;
120 in = TestUtil.getDataReader("unicode/UnicodeData.txt");
121 } catch (Exception e) {
122 System.err.println("Failed to read UnicodeData.txt");
// NOTE(review): nothing visible after this message returns or rethrows; confirm that `in`
// cannot still be null when in.readLine() is reached below.
131 // read a line, discarding comments and blank lines
133 String line = in.readLine();
134 if (line == null) break;
135 int comment = line.indexOf('#'); // strip comments
136 if (comment != -1) line = line.substring(0,comment);
137 if (line.length() == 0) continue;
// progress output: fires whenever the low eight bits of counter are zero
140 if ((counter & 0xFF) == 0) System.out.println("At: " + line);
143 // find the values of the particular fields that we need
144 // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
// Fields are ';'-separated. Each indexOf call below first moves start just past the previous
// ';' (the embedded assignment start=end+1) and then sets end to the next ';'.
147 int end = line.indexOf(';'); // code
148 value = Integer.parseInt(line.substring(start,end),16);
// debugging hook for U+00C0; the only visible statement under it is commented out
149 if (true && value == '\u00c0') {
150 //System.out.println("debug: " + line);
152 end = line.indexOf(';',start=end+1); // name
153 /*String name = line.substring(start,end);*/
154 end = line.indexOf(';',start=end+1); // general category
155 end = line.indexOf(';',start=end+1); // canonical class
157 // check consistency: canonical classes must be from 0 to 255
// the class field is parsed as decimal (no radix argument), unlike the hex code point above
159 int cc = Integer.parseInt(line.substring(start,end));
160 if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
161 canonicalClass.put(value,cc);
162 end = line.indexOf(';',start=end+1); // BIDI
163 end = line.indexOf(';',start=end+1); // decomp
165 // decomp requires more processing.
166 // store whether it is canonical or compatibility.
167 // store the decomp in one table, and the reverse mapping (from pairs) in another
170 String segment = line.substring(start, end);
// a leading '<' (a <...> tag, which fromHex skips) marks a compatibility decomposition
171 boolean compat = segment.charAt(0) == '<';
172 if (compat) isCompatibility.set(value);
173 String decomp = fromHex(segment);
// NOTE(review): the three string-concatenation lines after the next comment are fragments of a
// print statement whose opening and closing lines are not visible here.
175 // a small snippet of code to generate the Applet data
180 "\"\\u" + hex((char)value) + "\", "
181 + "\"\\u" + hex(decomp, "\\u") + "\", "
182 + (compat ? "\"K\"," : "\"\",")
187 // check consistency: all canon decomps must be singles or pairs!
188 int decompLen = UTF16Util.countCodePoint(decomp);
// && binds tighter than ||: reported when decompLen < 1, or when the decomposition is not a
// compatibility one and decompLen > 2
189 if (decompLen < 1 || decompLen > 2 && !compat) {
190 System.err.println("Bad decomp at: " + line);
192 decompose.put(value, decomp);
194 // only compositions are canonical pairs
195 // skip if script exclusion
197 if (!compat && !isExcluded.get(value)) {
198 int first = '\u0000';
199 int second = UTF16Util.nextCodePoint(decomp, 0);
// second is re-read at the offset given by codePointLength(first); presumably this steps past
// the first code point of a pair (the statement that assigns first is not visible)
202 second = UTF16Util.nextCodePoint(decomp,
203 UTF16Util.codePointLength(first));
206 // store composition pair in single integer
// key layout: first in the upper 32 bits, second in the lower 32 bits of a long
208 pair = ((long)first << 32) | second;
209 if (DEBUG && value == '\u00C0') {
210 System.out.println("debug2: " + line);
212 compose.put(pair, value);
214 System.out.println("Excluding: " + decomp);
// NOTE(review): no in.close() is visible in this method; confirm the reader is released.
219 if (DEBUG) System.out.println("Done reading Unicode Character Database");
221 // add algorithmic Hangul decompositions
222 // this is more compact if done at runtime, but for simplicity we
225 if (DEBUG) System.out.println("Adding Hangul");
// one iteration per index 0 .. SCount-1; the syllable handled is value = SIndex + SBase
227 for (int SIndex = 0; SIndex < SCount; ++SIndex) {
228 int TIndex = SIndex % TCount;
// TIndex != 0: first is the syllable with TIndex subtracted, second is TBase + TIndex
230 if (TIndex != 0) { // triple
231 first = (char)(SBase + SIndex - TIndex);
232 second = (char)(TBase + TIndex);
// the next two assignments derive first from LBase and second from VBase (the line that
// introduces this alternative is not visible; presumably the TIndex == 0 case)
234 first = (char)(LBase + SIndex / NCount);
235 second = (char)(VBase + (SIndex % NCount) / TCount);
// same key layout as for the entries read from the data file
237 pair = ((long)first << 32) | second;
238 value = SIndex + SBase;
239 decompose.put(value, String.valueOf(first) + second);
240 compose.put(pair, value);
242 if (DEBUG) System.out.println("Done adding Hangul");
246 * Hangul composition constants
// Constants read by the Hangul loop in buildDecompositionTables:
//   SBase is added to the loop index SIndex to obtain the syllable code point;
//   LBase, VBase and TBase are the bases from which the two parts of each mapping are computed;
//   NCount and SCount are derived products (588 and 11172), SCount being the loop bound.
// NOTE(review): the line that opens this declaration (modifiers and type) is not visible here.
249 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
250 LCount = 19, VCount = 21, TCount = 28,
251 NCount = VCount * TCount, // 588
252 SCount = LCount * NCount; // 11172
255 * For use in an applet: just load a minimal set of data.
// Loads a small, hard-coded data set into the tables instead of reading the Unicode data files.
//   canonicalClass  - filled from classData, two array elements per put call
//   decompose       - receives character -> decomposition for every entry of decomposeData
//   compose         - receives packed pair -> character
//   isCompatibility - a bit is set for each entry whose third element equals "K"
//   isExcluded      - not referenced in the visible lines
// NOTE(review): the end of the decomposeData initializer, the declaration of classData and at
// least one conditional are not visible in this view of the method.
257 private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
258 LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
// Flat list of triples: [i] the character, [i+1] its decomposition,
// [i+2] "K" for a compatibility decomposition and "" otherwise.
259 String[] decomposeData = {
260 "\u005E", "\u0020\u0302", "K",
261 "\u005F", "\u0020\u0332", "K",
262 "\u0060", "\u0020\u0300", "K",
263 "\u00A0", "\u0020", "K",
264 "\u00A8", "\u0020\u0308", "K",
265 "\u00AA", "\u0061", "K",
266 "\u00AF", "\u0020\u0304", "K",
267 "\u00B2", "\u0032", "K",
268 "\u00B3", "\u0033", "K",
269 "\u00B4", "\u0020\u0301", "K",
270 "\u00B5", "\u03BC", "K",
271 "\u00B8", "\u0020\u0327", "K",
272 "\u00B9", "\u0031", "K",
273 "\u00BA", "\u006F", "K",
274 "\u00BC", "\u0031\u2044\u0034", "K",
275 "\u00BD", "\u0031\u2044\u0032", "K",
276 "\u00BE", "\u0033\u2044\u0034", "K",
277 "\u00C0", "\u0041\u0300", "",
278 "\u00C1", "\u0041\u0301", "",
279 "\u00C2", "\u0041\u0302", "",
280 "\u00C3", "\u0041\u0303", "",
281 "\u00C4", "\u0041\u0308", "",
282 "\u00C5", "\u0041\u030A", "",
283 "\u00C7", "\u0043\u0327", "",
284 "\u00C8", "\u0045\u0300", "",
285 "\u00C9", "\u0045\u0301", "",
286 "\u00CA", "\u0045\u0302", "",
287 "\u00CB", "\u0045\u0308", "",
288 "\u00CC", "\u0049\u0300", "",
289 "\u00CD", "\u0049\u0301", "",
290 "\u00CE", "\u0049\u0302", "",
291 "\u00CF", "\u0049\u0308", "",
292 "\u00D1", "\u004E\u0303", "",
293 "\u00D2", "\u004F\u0300", "",
294 "\u00D3", "\u004F\u0301", "",
295 "\u00D4", "\u004F\u0302", "",
296 "\u00D5", "\u004F\u0303", "",
297 "\u00D6", "\u004F\u0308", "",
298 "\u00D9", "\u0055\u0300", "",
299 "\u00DA", "\u0055\u0301", "",
300 "\u00DB", "\u0055\u0302", "",
301 "\u00DC", "\u0055\u0308", "",
302 "\u00DD", "\u0059\u0301", "",
303 "\u00E0", "\u0061\u0300", "",
304 "\u00E1", "\u0061\u0301", "",
305 "\u00E2", "\u0061\u0302", "",
306 "\u00E3", "\u0061\u0303", "",
307 "\u00E4", "\u0061\u0308", "",
308 "\u00E5", "\u0061\u030A", "",
309 "\u00E7", "\u0063\u0327", "",
310 "\u00E8", "\u0065\u0300", "",
311 "\u00E9", "\u0065\u0301", "",
312 "\u00EA", "\u0065\u0302", "",
313 "\u00EB", "\u0065\u0308", "",
314 "\u00EC", "\u0069\u0300", "",
315 "\u00ED", "\u0069\u0301", "",
316 "\u00EE", "\u0069\u0302", "",
317 "\u00EF", "\u0069\u0308", "",
318 "\u00F1", "\u006E\u0303", "",
319 "\u00F2", "\u006F\u0300", "",
320 "\u00F3", "\u006F\u0301", "",
321 "\u00F4", "\u006F\u0302", "",
322 "\u00F5", "\u006F\u0303", "",
323 "\u00F6", "\u006F\u0308", "",
324 "\u00F9", "\u0075\u0300", "",
325 "\u00FA", "\u0075\u0301", "",
326 "\u00FB", "\u0075\u0302", "",
327 "\u00FC", "\u0075\u0308", "",
328 "\u00FD", "\u0079\u0301", "",
329 // EXTRAS, outside of Latin 1
330 "\u1EA4", "\u00C2\u0301", "",
331 "\u1EA5", "\u00E2\u0301", "",
332 "\u1EA6", "\u00C2\u0300", "",
333 "\u1EA7", "\u00E2\u0300", "",
411 // build the same tables we would otherwise get from the
412 // Unicode Character Database, just with limited data
// walk the triples: value, decomposition, compatibility flag
414 for (int i = 0; i < decomposeData.length; i+=3) {
415 char value = decomposeData[i].charAt(0);
416 String decomp = decomposeData[i+1];
417 boolean compat = decomposeData[i+2].equals("K");
418 if (compat) isCompatibility.set(value);
419 decompose.put(value, decomp);
421 int first = '\u0000';
422 int second = UTF16Util.nextCodePoint(decomp, 0);
// String.length() counts UTF-16 code units, not code points
423 if (decomp.length() > 1) {
425 second = UTF16Util.nextCodePoint(decomp,
426 UTF16Util.codePointLength(first));
// NOTE(review): this key is built with an int shift by 16 and only then widened to long, while
// buildDecompositionTables packs the same kind of pair as ((long)first << 32) | second.
// The two layouts produce different keys for the same pair; confirm which one the lookup side
// expects -- this looks like it should match the 32-bit layout.
428 long pair = (first << 16) | second;
429 compose.put(pair, value);
// classData is consumed two elements per iteration; Java evaluates the arguments left to
// right, so the first element of each pair is the key and the second is the stored value
433 for (int i = 0; i < classData.length;) {
434 canonicalClass.put(classData[i++], classData[i++]);
439 * Utility: Parses a sequence of hex Unicode characters separated by spaces
441 static public String fromHex(String source) {
442 StringBuffer result = new StringBuffer();
443 for (int i = 0; i < source.length(); ++i) {
444 char c = source.charAt(i);
446 case ' ': break; // ignore
447 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
448 case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
449 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
453 //System.out.println(source.substring(i, i + 4) + "************" + source);
454 end = source.indexOf(' ',i);
456 end = source.length();
458 value = Integer.parseInt(source.substring(i, end),16);
459 UTF16Util.appendCodePoint(result, value);
460 } catch (Exception e) {
461 System.out.println("i: " + i + ";end:" + end + "source:" + source);
462 //System.out.println(source.substring(i, i + 4) + "************" + source);
465 //i+= 3; // skip rest of number
468 case '<': int j = source.indexOf('>',i); // skip <...>
472 } // else fall through--error
474 throw new IllegalArgumentException("Bad hex value in " + source);
477 return result.toString();
481 * Utility: Supplies a zero-padded hex representation of an integer (without 0x)
483 static public String hex(int i) {
484 String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
485 return "00000000".substring(result.length(),8) + result;
489 * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
491 static public String hex(char i) {
492 String result = Integer.toString(i, 16).toUpperCase();
493 return "0000".substring(result.length(),4) + result;
497 * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
499 public static String hex(String s, String sep) {
500 StringBuffer result = new StringBuffer();
501 for (int i = 0; i < s.length(); ++i) {
502 if (i != 0) result.append(sep);
503 result.append(hex(s.charAt(i)));
505 return result.toString();