2 * Builds the normalization tables. This is a separate class so that it
\r
3 * can be unloaded once not needed.<br>
\r
4 * Copyright (C) 1998-2007 International Business Machines Corporation and
\r
5 * Unicode, Inc. All Rights Reserved.<br>
\r
6 * The Unicode Consortium makes no expressed or implied warranty of any
\r
7 * kind, and assumes no liability for errors or omissions.
\r
8 * No liability is assumed for incidental and consequential damages
\r
9 * in connection with or arising out of the use of the information here.
\r
10 * @author Mark Davis
\r
11 * Updates for supplementary code points:
\r
12 * Vladimir Weinstein & Markus Scherer
\r
15 package com.ibm.icu.dev.test.normalizer;
\r
17 import java.io.BufferedReader;
\r
18 import java.util.BitSet;
\r
20 import com.ibm.icu.dev.test.TestUtil;
\r
21 import com.ibm.icu.dev.test.UTF16Util;
\r
23 class NormalizerBuilder {
\r
24 //private static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
\r
// Compile-time switch: when true, the table-building methods in this class print progress messages to System.out.
30 private static final boolean DEBUG = false;
\r
31 //private static final boolean GENERATING = false;
\r
34 * Constants for the data file version to use.
\r
36 /*static final boolean NEW_VERSION = true;
\r
37 private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");
\r
39 static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
\r
40 static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";
\r
42 public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
\r
43 public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
\r
47 * Called exactly once by NormalizerData to build the static data
\r
// Builds the normalization tables and returns them wrapped in a NormalizerData.
// Visible flow: five empty tables are created, filled either from the Unicode data files
// (readExclusionList, then buildDecompositionTables) or from the built-in abridged set
// (setMinimalDecomp), and handed to the NormalizerData constructor.
// NOTE(review): the openers that pair with "} else {" and "} catch" below, and whatever follows
// the error message inside the catch block, are not visible in this listing. Presumably the
// branch tests fullData; what the method returns after an IOException cannot be told from here.
// Confirm both against the complete file.
50 static NormalizerData build(boolean fullData) {
\r
// NOTE(review): the constructor arguments (0, null, NOT_COMPOSITE) are presumably the values
// each table reports for a missing key; verify against the hashtable classes.
52 IntHashtable canonicalClass = new IntHashtable(0);
\r
53 IntStringHashtable decompose = new IntStringHashtable(null);
\r
54 LongHashtable compose = new LongHashtable(NormalizerData.NOT_COMPOSITE);
\r
55 BitSet isCompatibility = new BitSet();
\r
56 BitSet isExcluded = new BitSet();
\r
58 //System.out.println("Building Normalizer Data from file.");
\r
// The exclusion set is read first because buildDecompositionTables consults it when
// deciding which canonical decompositions get a reverse (composition) entry.
59 readExclusionList(isExcluded);
\r
60 //System.out.println(isExcluded.get(0x00C0));
\r
61 buildDecompositionTables(canonicalClass, decompose, compose,
\r
62 isCompatibility, isExcluded);
\r
63 } else { // for use in Applets
\r
64 //System.out.println("Building abridged data.");
\r
65 setMinimalDecomp(canonicalClass, decompose, compose,
\r
66 isCompatibility, isExcluded);
\r
68 return new NormalizerData(canonicalClass, decompose, compose,
\r
69 isCompatibility, isExcluded);
\r
// Both file readers declare java.io.IOException; a failure is reported on System.err only.
70 } catch (java.io.IOException e) {
\r
71 System.err.println("Can't load data file." + e + ", " + e.getMessage());
\r
76 // =============================================================
\r
77 // Building Decomposition Tables
\r
78 // =============================================================
\r
81 * Reads exclusion list and stores the data
\r
// Reads unicode/CompositionExclusions.txt and sets one bit in isExcluded for every code point listed.
// Each line has its '#' comment removed, is cut at the first space, and the remainder is parsed
// as a hexadecimal code point.
// Throws java.io.IOException if reading the data file fails.
// NOTE(review): the loop header and the end of the method are not visible in this listing, so it
// cannot be confirmed here that the reader is closed; check the complete file.
83 private static void readExclusionList(BitSet isExcluded) throws java.io.IOException {
\r
84 if (DEBUG) System.out.println("Reading Exclusions");
\r
86 BufferedReader in = TestUtil.getDataReader("unicode/CompositionExclusions.txt");
\r
89 // read a line, discarding comments and blank lines
\r
91 String line = in.readLine();
\r
// readLine() returns null at end of input, which ends the loop.
92 if (line == null) break;
\r
93 int comment = line.indexOf('#'); // strip comments
\r
94 if (comment != -1) line = line.substring(0,comment);
\r
95 if (line.length() == 0) continue; // ignore blanks
\r
// Keep only the text before the first space (drops padding after the code point).
// NOTE(review): a line that starts with a space, or is whitespace-only once the comment is
// removed, becomes the empty string here and Integer.parseInt below throws NumberFormatException.
96 if(line.indexOf(' ') != -1) {
\r
97 line = line.substring(0, line.indexOf(' '));
\r
99 // mark each listed code point in the excluded bit set
\r
101 int value = Integer.parseInt(line,16);
\r
102 isExcluded.set(value);
\r
103 //System.out.println("Excluding " + hex(value));
\r
106 if (DEBUG) System.out.println("Done reading Exclusions");
\r
110 * Builds a decomposition table from a UnicodeData file
\r
// Fills the normalization tables from unicode/UnicodeData.txt and then adds the Hangul syllables.
// Per data line it records: the canonical combining class (canonicalClass), the decomposition
// string (decompose), whether the decomposition is a compatibility one (isCompatibility), and,
// for canonical decompositions that are not excluded, the reverse mapping from the packed
// code point pair back to the composite (compose).
// isExcluded is only read here; it must already have been filled.
// Throws java.io.IOException if reading the data file fails.
// NOTE(review): several short lines of this method are not visible in this listing, including
// the declarations of value, pair, start and counter, the loop header, the "try {" opener and the
// closing braces. Comments below describe only what the visible lines show.
112 private static void buildDecompositionTables(
\r
113 IntHashtable canonicalClass, IntStringHashtable decompose,
\r
114 LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
\r
115 throws java.io.IOException {
\r
116 if (DEBUG) System.out.println("Reading Unicode Character Database");
\r
117 //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
\r
118 BufferedReader in = null;
\r
120 in = TestUtil.getDataReader("unicode/UnicodeData.txt");
\r
// NOTE(review): any Exception while opening the file is reported on System.err; what follows the
// message (rethrow, return, or continuing with in == null) is not visible here. Confirm.
121 } catch (Exception e) {
\r
122 System.err.println("Failed to read UnicodeData.txt");
\r
131 // read a line, discarding comments and blank lines
\r
133 String line = in.readLine();
\r
134 if (line == null) break;
\r
135 int comment = line.indexOf('#'); // strip comments
\r
136 if (comment != -1) line = line.substring(0,comment);
\r
137 if (line.length() == 0) continue;
\r
// Progress trace, printed whenever the low eight bits of counter are all zero.
// NOTE(review): presumably guarded by DEBUG and counter is incremented per line; neither is visible.
140 if ((counter & 0xFF) == 0) System.out.println("At: " + line);
\r
143 // find the values of the particular fields that we need
\r
144 // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
\r
// Fields are walked left to right: each indexOf call below moves start to just past the
// previous ';' and end to the next ';', so line.substring(start,end) is the current field.
147 int end = line.indexOf(';'); // code
\r
148 value = Integer.parseInt(line.substring(start,end),16);
\r
// Debugging hook for U+00C0; the only visible statement in its body is commented out.
149 if (true && value == '\u00c0') {
\r
150 //System.out.println("debug: " + line);
\r
152 end = line.indexOf(';',start=end+1); // name
\r
153 /*String name = line.substring(start,end);*/
\r
154 end = line.indexOf(';',start=end+1); // general category
\r
155 end = line.indexOf(';',start=end+1); // canonical class
\r
157 // check consistency: canonical classes must be from 0 to 255
\r
// The class is parsed as decimal; an out-of-range value is reported but still stored.
159 int cc = Integer.parseInt(line.substring(start,end));
\r
160 if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
\r
161 canonicalClass.put(value,cc);
\r
162 end = line.indexOf(';',start=end+1); // BIDI
\r
163 end = line.indexOf(';',start=end+1); // decomp
\r
165 // decomp requires more processing.
\r
166 // store whether it is canonical or compatibility.
\r
167 // store the decomp in one table, and the reverse mapping (from pairs) in another
\r
// start == end means the decomposition field is empty, i.e. the character has no decomposition.
169 if (start != end) {
\r
170 String segment = line.substring(start, end);
\r
// A field that begins with '<' carries a formatting tag and is treated as a compatibility mapping.
171 boolean compat = segment.charAt(0) == '<';
\r
172 if (compat) isCompatibility.set(value);
\r
// fromHex skips the tag and turns the space-separated hex numbers into a string.
173 String decomp = fromHex(segment);
\r
175 // a small snippet of code to generate the Applet data
\r
177 /*if (GENERATING) {
\r
178 if (value < 0xFF) {
\r
179 System.out.println(
\r
180 "\"\\u" + hex((char)value) + "\", "
\r
181 + "\"\\u" + hex(decomp, "\\u") + "\", "
\r
182 + (compat ? "\"K\"," : "\"\",")
\r
187 // check consistency: all canon decomps must be singles or pairs!
\r
188 int decompLen = UTF16Util.countCodePoint(decomp);
\r
// Because && binds tighter than ||, this reads: empty, or (longer than two and canonical).
// The problem is only reported; the decomposition is still stored below.
189 if (decompLen < 1 || decompLen > 2 && !compat) {
\r
190 System.err.println("Bad decomp at: " + line);
\r
192 decompose.put(value, decomp);
\r
194 // only compositions are canonical pairs
\r
195 // skip if script exclusion
\r
197 if (!compat && !isExcluded.get(value)) {
\r
// For a single code point decomposition the pair is (0, thatCodePoint).
198 int first = '\u0000';
\r
199 int second = UTF16Util.nextCodePoint(decomp, 0);
\r
200 if (decompLen > 1) {
\r
// NOTE(review): the statement that moves the leading code point into first is not visible in
// this listing; the offset below only makes sense if first holds it by this point. Confirm.
202 second = UTF16Util.nextCodePoint(decomp,
\r
203 UTF16Util.codePointLength(first));
\r
206 // store composition pair in single integer
\r
// Packing: first in the upper 32 bits, second in the lower 32 bits of a long.
208 pair = ((long)first << 32) | second;
\r
209 if (DEBUG && value == '\u00C0') {
\r
210 System.out.println("debug2: " + line);
\r
212 compose.put(pair, value);
\r
213 } else if (DEBUG) {
\r
214 System.out.println("Excluding: " + decomp);
\r
219 if (DEBUG) System.out.println("Done reading Unicode Character Database");
\r
221 // add algorithmic Hangul decompositions
\r
222 // this is more compact if done at runtime, but for simplicity we
\r
225 if (DEBUG) System.out.println("Adding Hangul");
\r
// One iteration per precomposed syllable, SBase .. SBase + SCount - 1.
227 for (int SIndex = 0; SIndex < SCount; ++SIndex) {
\r
// TIndex is the trailing-consonant index; 0 means the syllable has no trailing consonant.
228 int TIndex = SIndex % TCount;
\r
229 char first, second;
\r
// With a trailing consonant: decompose into (the same syllable without it, the trailing jamo).
230 if (TIndex != 0) { // triple
\r
231 first = (char)(SBase + SIndex - TIndex);
\r
232 second = (char)(TBase + TIndex);
\r
// Without one: decompose into (leading consonant, vowel).
234 first = (char)(LBase + SIndex / NCount);
\r
235 second = (char)(VBase + (SIndex % NCount) / TCount);
\r
// Same packing as for the file-driven entries above.
237 pair = ((long)first << 32) | second;
\r
238 value = SIndex + SBase;
\r
239 decompose.put(value, String.valueOf(first) + second);
\r
240 compose.put(pair, value);
\r
242 if (DEBUG) System.out.println("Done adding Hangul");
\r
246 * Hangul composition constants
\r
// Declarator list of the Hangul constants read by the Hangul loop in buildDecompositionTables.
// NOTE(review): the line carrying the modifiers and type of this declaration is not visible in this listing.
// SBase is the first precomposed syllable; LBase and VBase are added to a zero-based index,
// while TBase is only ever added to a non-zero trailing index.
249 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
\r
250 LCount = 19, VCount = 21, TCount = 28,
\r
251 NCount = VCount * TCount, // 588
\r
252 SCount = LCount * NCount; // 11172
\r
255 * For use in an applet: just load a minimal set of data.
\r
// Fills the tables from data compiled into this method instead of from the Unicode data files.
// decomposeData is read in groups of three strings: the character, its decomposition, and "K"
// when the decomposition is a compatibility one (empty string otherwise).
// classData is read in pairs and stored into canonicalClass.
// isExcluded is accepted but not touched by any visible line.
// NOTE(review): the contents of classData and a few short lines of the loop body are not visible
// in this listing; comments below describe only what the visible lines show.
257 private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
\r
258 LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
\r
259 String[] decomposeData = {
\r
260 "\u005E", "\u0020\u0302", "K",
\r
261 "\u005F", "\u0020\u0332", "K",
\r
262 "\u0060", "\u0020\u0300", "K",
\r
263 "\u00A0", "\u0020", "K",
\r
264 "\u00A8", "\u0020\u0308", "K",
\r
265 "\u00AA", "\u0061", "K",
\r
266 "\u00AF", "\u0020\u0304", "K",
\r
267 "\u00B2", "\u0032", "K",
\r
268 "\u00B3", "\u0033", "K",
\r
269 "\u00B4", "\u0020\u0301", "K",
\r
270 "\u00B5", "\u03BC", "K",
\r
271 "\u00B8", "\u0020\u0327", "K",
\r
272 "\u00B9", "\u0031", "K",
\r
273 "\u00BA", "\u006F", "K",
\r
274 "\u00BC", "\u0031\u2044\u0034", "K",
\r
275 "\u00BD", "\u0031\u2044\u0032", "K",
\r
276 "\u00BE", "\u0033\u2044\u0034", "K",
\r
277 "\u00C0", "\u0041\u0300", "",
\r
278 "\u00C1", "\u0041\u0301", "",
\r
279 "\u00C2", "\u0041\u0302", "",
\r
280 "\u00C3", "\u0041\u0303", "",
\r
281 "\u00C4", "\u0041\u0308", "",
\r
282 "\u00C5", "\u0041\u030A", "",
\r
283 "\u00C7", "\u0043\u0327", "",
\r
284 "\u00C8", "\u0045\u0300", "",
\r
285 "\u00C9", "\u0045\u0301", "",
\r
286 "\u00CA", "\u0045\u0302", "",
\r
287 "\u00CB", "\u0045\u0308", "",
\r
288 "\u00CC", "\u0049\u0300", "",
\r
289 "\u00CD", "\u0049\u0301", "",
\r
290 "\u00CE", "\u0049\u0302", "",
\r
291 "\u00CF", "\u0049\u0308", "",
\r
292 "\u00D1", "\u004E\u0303", "",
\r
293 "\u00D2", "\u004F\u0300", "",
\r
294 "\u00D3", "\u004F\u0301", "",
\r
295 "\u00D4", "\u004F\u0302", "",
\r
296 "\u00D5", "\u004F\u0303", "",
\r
297 "\u00D6", "\u004F\u0308", "",
\r
298 "\u00D9", "\u0055\u0300", "",
\r
299 "\u00DA", "\u0055\u0301", "",
\r
300 "\u00DB", "\u0055\u0302", "",
\r
301 "\u00DC", "\u0055\u0308", "",
\r
302 "\u00DD", "\u0059\u0301", "",
\r
303 "\u00E0", "\u0061\u0300", "",
\r
304 "\u00E1", "\u0061\u0301", "",
\r
305 "\u00E2", "\u0061\u0302", "",
\r
306 "\u00E3", "\u0061\u0303", "",
\r
307 "\u00E4", "\u0061\u0308", "",
\r
308 "\u00E5", "\u0061\u030A", "",
\r
309 "\u00E7", "\u0063\u0327", "",
\r
310 "\u00E8", "\u0065\u0300", "",
\r
311 "\u00E9", "\u0065\u0301", "",
\r
312 "\u00EA", "\u0065\u0302", "",
\r
313 "\u00EB", "\u0065\u0308", "",
\r
314 "\u00EC", "\u0069\u0300", "",
\r
315 "\u00ED", "\u0069\u0301", "",
\r
316 "\u00EE", "\u0069\u0302", "",
\r
317 "\u00EF", "\u0069\u0308", "",
\r
318 "\u00F1", "\u006E\u0303", "",
\r
319 "\u00F2", "\u006F\u0300", "",
\r
320 "\u00F3", "\u006F\u0301", "",
\r
321 "\u00F4", "\u006F\u0302", "",
\r
322 "\u00F5", "\u006F\u0303", "",
\r
323 "\u00F6", "\u006F\u0308", "",
\r
324 "\u00F9", "\u0075\u0300", "",
\r
325 "\u00FA", "\u0075\u0301", "",
\r
326 "\u00FB", "\u0075\u0302", "",
\r
327 "\u00FC", "\u0075\u0308", "",
\r
328 "\u00FD", "\u0079\u0301", "",
\r
329 // EXTRAS, outside of Latin 1
\r
330 "\u1EA4", "\u00C2\u0301", "",
\r
331 "\u1EA5", "\u00E2\u0301", "",
\r
332 "\u1EA6", "\u00C2\u0300", "",
\r
333 "\u1EA7", "\u00E2\u0300", "",
\r
336 int[] classData = {
\r
411 // build the same tables we would otherwise get from the
\r
412 // Unicode Character Database, just with limited data
\r
// Step through decomposeData three entries at a time.
414 for (int i = 0; i < decomposeData.length; i+=3) {
\r
415 char value = decomposeData[i].charAt(0);
\r
416 String decomp = decomposeData[i+1];
\r
417 boolean compat = decomposeData[i+2].equals("K");
\r
418 if (compat) isCompatibility.set(value);
\r
419 decompose.put(value, decomp);
\r
// NOTE(review): whether the composition entry below is skipped for compatibility mappings
// (as buildDecompositionTables does) depends on a line not visible in this listing. Confirm.
421 int first = '\u0000';
\r
422 int second = UTF16Util.nextCodePoint(decomp, 0);
\r
// Length is measured in UTF-16 code units here; every decomposition in the table above is
// written with four-digit escapes, so this equals the code point count for this data.
423 if (decomp.length() > 1) {
\r
425 second = UTF16Util.nextCodePoint(decomp,
\r
426 UTF16Util.codePointLength(first));
\r
// NOTE(review): this packs the pair as (first << 16) | second, evaluated in int arithmetic,
// whereas buildDecompositionTables in this file packs ((long)first << 32) | second.
// The two layouts produce different keys for the same pair, so lookups written for one layout
// will not find entries stored with the other. Confirm which layout the reader of compose
// expects and make both builders agree.
428 long pair = (first << 16) | second;
\r
429 compose.put(pair, value);
\r
// Two array elements are consumed per iteration. Java evaluates arguments left to right, so
// the first element is the key and the second is the stored value.
433 for (int i = 0; i < classData.length;) {
\r
434 canonicalClass.put(classData[i++], classData[i++]);
\r
439 * Utility: Parses a sequence of hex Unicode characters separated by spaces
\r
// Converts a string of hexadecimal numbers separated by spaces into the string made of those
// code points. A leading or embedded tag of the form <...> is skipped.
// Returns the assembled string.
// Throws IllegalArgumentException ("Bad hex value in ...") for a character that cannot start a
// hex number, a space, or a tag.
// NOTE(review): the "switch" header, the declarations of end and value, the "try {" opener, the
// statements that advance i past a number or a tag, and the "default:" label are not visible in
// this listing. Comments below describe only what the visible lines show.
441 static public String fromHex(String source) {
\r
442 StringBuffer result = new StringBuffer();
\r
443 for (int i = 0; i < source.length(); ++i) {
\r
444 char c = source.charAt(i);
\r
446 case ' ': break; // ignore
\r
// Any hex digit, upper or lower case, starts a number.
447 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
\r
448 case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
\r
449 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
\r
453 //System.out.println(source.substring(i, i + 4) + "************" + source);
\r
// The number runs from i up to the next space.
454 end = source.indexOf(' ',i);
\r
// NOTE(review): presumably applied only when no space was found, so the last number runs to the
// end of the string; the guarding condition is not visible here.
456 end = source.length();
\r
458 value = Integer.parseInt(source.substring(i, end),16);
\r
459 UTF16Util.appendCodePoint(result, value);
\r
// Any failure while parsing or appending is printed to System.out; no rethrow is visible, so
// the number appears to be dropped from the result. Confirm against the complete file.
460 } catch (Exception e) {
\r
461 System.out.println("i: " + i + ";end:" + end + "source:" + source);
\r
462 //System.out.println(source.substring(i, i + 4) + "************" + source);
\r
465 //i+= 3; // skip rest of number
\r
// Locate the '>' that closes the tag; the lines that use j are not visible here.
468 case '<': int j = source.indexOf('>',i); // skip <...>
\r
472 } // else fall through--error
\r
474 throw new IllegalArgumentException("Bad hex value in " + source);
\r
477 return result.toString();
\r
481 * Utility: Supplies a zero-padded hex representation of an integer (without 0x)
\r
483 static public String hex(int i) {
\r
484 String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
\r
485 return "00000000".substring(result.length(),8) + result;
\r
489 * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
\r
491 static public String hex(char i) {
\r
492 String result = Integer.toString(i, 16).toUpperCase();
\r
493 return "0000".substring(result.length(),4) + result;
\r
497 * Utility: Supplies the zero-padded hex representation of each UTF-16 code unit of a string, joined by a separator (without 0x, \\u)
\r
499 public static String hex(String s, String sep) {
\r
500 StringBuffer result = new StringBuffer();
\r
501 for (int i = 0; i < s.length(); ++i) {
\r
502 if (i != 0) result.append(sep);
\r
503 result.append(hex(s.charAt(i)));
\r
505 return result.toString();
\r