]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_8_1_1/main/tests/core/src/com/ibm/icu/dev/test/normalizer/NormalizerBuilder.java
Added flags.
[Dictionary.git] / jars / icu4j-4_8_1_1 / main / tests / core / src / com / ibm / icu / dev / test / normalizer / NormalizerBuilder.java
1 /**
2  * Builds the normalization tables. This is a separate class so that it
3  * can be unloaded once not needed.<br>
4  * Copyright (C) 1998-2007 International Business Machines Corporation and
5  * Unicode, Inc. All Rights Reserved.<br>
6  * The Unicode Consortium makes no expressed or implied warranty of any
7  * kind, and assumes no liability for errors or omissions.
8  * No liability is assumed for incidental and consequential damages
9  * in connection with or arising out of the use of the information here.
10  * @author Mark Davis
11  * Updates for supplementary code points:
12  * Vladimir Weinstein & Markus Scherer
13  */
14
15 package com.ibm.icu.dev.test.normalizer;
16
17 import java.io.BufferedReader;
18 import java.util.BitSet;
19
20 import com.ibm.icu.dev.test.TestUtil;
21 import com.ibm.icu.dev.test.UTF16Util;
22
23 class NormalizerBuilder {
24     //private static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
25
26     /**
27      * Testing flags
28      */
29
30     private static final boolean DEBUG = false;
31     //private static final boolean GENERATING = false;
32
33     /**
34      * Constants for the data file version to use.
35      */
36     /*static final boolean NEW_VERSION = true;
37     private static final String DIR = "D:\\UnicodeData\\" + (NEW_VERSION ? "WorkingGroups\\" : "");
38
39     static final String UNIDATA_VERSION = NEW_VERSION ? "3.0.0d12" : "2.1.9";
40     static final String EXCLUSIONS_VERSION = NEW_VERSION ? "1d4" : "1";
41
42     public static final String UNICODE_DATA = DIR + "UnicodeData-" + UNIDATA_VERSION + ".txt";
43     public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions-" + EXCLUSIONS_VERSION +".txt";
44     */
45
46     /**
47      * Called exactly once by NormalizerData to build the static data
48      */
49
50     static NormalizerData build(boolean fullData) {
51         try {
52             IntHashtable canonicalClass = new IntHashtable(0);
53             IntStringHashtable decompose = new IntStringHashtable(null);
54             LongHashtable compose = new LongHashtable(NormalizerData.NOT_COMPOSITE);
55             BitSet isCompatibility = new BitSet();
56             BitSet isExcluded = new BitSet();
57             if (fullData) {
58                 //System.out.println("Building Normalizer Data from file.");
59                 readExclusionList(isExcluded);
60                 //System.out.println(isExcluded.get(0x00C0));
61                 buildDecompositionTables(canonicalClass, decompose, compose,
62                   isCompatibility, isExcluded);
63             } else {    // for use in Applets
64                 //System.out.println("Building abridged data.");
65                 setMinimalDecomp(canonicalClass, decompose, compose,
66                   isCompatibility, isExcluded);
67             }
68             return new NormalizerData(canonicalClass, decompose, compose,
69                   isCompatibility, isExcluded);
70         } catch (java.io.IOException e) {
71             System.err.println("Can't load data file." + e + ", " + e.getMessage());
72             return null;
73         }
74     }
75
76 // =============================================================
77 // Building Decomposition Tables
78 // =============================================================
79
80     /**
81      * Reads exclusion list and stores the data
82      */
83     private static void readExclusionList(BitSet isExcluded) throws java.io.IOException {
84         if (DEBUG) System.out.println("Reading Exclusions");
85
86         BufferedReader in = TestUtil.getDataReader("unicode/CompositionExclusions.txt");
87
88         while (true) {
89             // read a line, discarding comments and blank lines
90
91             String line = in.readLine();
92             if (line == null) break;
93             int comment = line.indexOf('#');                    // strip comments
94             if (comment != -1) line = line.substring(0,comment);
95             if (line.length() == 0) continue;                   // ignore blanks
96             if(line.indexOf(' ') != -1) {
97                 line = line.substring(0, line.indexOf(' '));
98             }
99             // store -1 in the excluded table for each character hit
100
101             int value = Integer.parseInt(line,16);
102             isExcluded.set(value);
103             //System.out.println("Excluding " + hex(value));
104         }
105         in.close();
106         if (DEBUG) System.out.println("Done reading Exclusions");
107     }
108
109     /**
110      * Builds a decomposition table from a UnicodeData file
111      */
112     private static void buildDecompositionTables(
113       IntHashtable canonicalClass, IntStringHashtable decompose,
114       LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
115       throws java.io.IOException {
116         if (DEBUG) System.out.println("Reading Unicode Character Database");
117         //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
118         BufferedReader in = null;
119         try {
120             in = TestUtil.getDataReader("unicode/UnicodeData.txt");
121         } catch (Exception e) {
122             System.err.println("Failed to read UnicodeData.txt");
123             System.exit(1);
124         }
125
126         int value;
127         long pair;
128         int counter = 0;
129         while (true) {
130
131             // read a line, discarding comments and blank lines
132
133             String line = in.readLine();
134             if (line == null) break;
135             int comment = line.indexOf('#');                    // strip comments
136             if (comment != -1) line = line.substring(0,comment);
137             if (line.length() == 0) continue;
138             if (DEBUG) {
139                 counter++;
140                 if ((counter & 0xFF) == 0) System.out.println("At: " + line);
141             }
142
143             // find the values of the particular fields that we need
144             // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
145
146             int start = 0;
147             int end = line.indexOf(';'); // code
148             value = Integer.parseInt(line.substring(start,end),16);
149             if (true && value == '\u00c0') {
150                 //System.out.println("debug: " + line);
151             }
152             end = line.indexOf(';',start=end+1); // name
153             /*String name = line.substring(start,end);*/
154             end = line.indexOf(';',start=end+1); // general category
155             end = line.indexOf(';',start=end+1); // canonical class
156
157             // check consistency: canonical classes must be from 0 to 255
158
159             int cc = Integer.parseInt(line.substring(start,end));
160             if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
161             canonicalClass.put(value,cc);
162             end = line.indexOf(';',start=end+1); // BIDI
163             end = line.indexOf(';',start=end+1); // decomp
164
165             // decomp requires more processing.
166             // store whether it is canonical or compatibility.
167             // store the decomp in one table, and the reverse mapping (from pairs) in another
168
169             if (start != end) {
170                 String segment = line.substring(start, end);
171                 boolean compat = segment.charAt(0) == '<';
172                 if (compat) isCompatibility.set(value);
173                 String decomp = fromHex(segment);
174
175                 // a small snippet of code to generate the Applet data
176
177                 /*if (GENERATING) {
178                     if (value < 0xFF) {
179                         System.out.println(
180                             "\"\\u" + hex((char)value) + "\", "
181                             + "\"\\u" + hex(decomp, "\\u") + "\", "
182                             + (compat ? "\"K\"," : "\"\",")
183                             + "// " + name);
184                     }
185                 }*/
186
187                 // check consistency: all canon decomps must be singles or pairs!
188                 int decompLen = UTF16Util.countCodePoint(decomp);
189                 if (decompLen < 1 || decompLen > 2 && !compat) {
190                     System.err.println("Bad decomp at: " + line);
191                 }
192                 decompose.put(value, decomp);
193
194                 // only compositions are canonical pairs
195                 // skip if script exclusion
196
197                 if (!compat && !isExcluded.get(value)) {
198                     int first = '\u0000';
199                     int second = UTF16Util.nextCodePoint(decomp, 0);
200                     if (decompLen > 1) {
201                         first = second;
202                         second = UTF16Util.nextCodePoint(decomp,
203                             UTF16Util.codePointLength(first));
204                     }
205
206                     // store composition pair in single integer
207
208                     pair = ((long)first << 32) | second;
209                     if (DEBUG && value == '\u00C0') {
210                         System.out.println("debug2: " + line);
211                     }
212                     compose.put(pair, value);
213                 } else if (DEBUG) {
214                     System.out.println("Excluding: " + decomp);
215                 }
216             }
217         }
218         in.close();
219         if (DEBUG) System.out.println("Done reading Unicode Character Database");
220
221         // add algorithmic Hangul decompositions
222         // this is more compact if done at runtime, but for simplicity we
223         // do it this way.
224
225         if (DEBUG) System.out.println("Adding Hangul");
226
227         for (int SIndex = 0; SIndex < SCount; ++SIndex) {
228             int TIndex = SIndex % TCount;
229             char first, second;
230             if (TIndex != 0) { // triple
231                 first = (char)(SBase + SIndex - TIndex);
232                 second = (char)(TBase + TIndex);
233             } else {
234                 first = (char)(LBase + SIndex / NCount);
235                 second = (char)(VBase + (SIndex % NCount) / TCount);
236             }
237             pair = ((long)first << 32) | second;
238             value = SIndex + SBase;
239             decompose.put(value, String.valueOf(first) + second);
240             compose.put(pair, value);
241         }
242         if (DEBUG) System.out.println("Done adding Hangul");
243     }
244
245     /**
246      * Hangul composition constants
247      */
248     static final int
249         SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
250         LCount = 19, VCount = 21, TCount = 28,
251         NCount = VCount * TCount,   // 588
252         SCount = LCount * NCount;   // 11172
253
254     /**
255      * For use in an applet: just load a minimal set of data.
256      */
257     private static void setMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
258       LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
259         String[] decomposeData = {
260             "\u005E", "\u0020\u0302", "K",
261             "\u005F", "\u0020\u0332", "K",
262             "\u0060", "\u0020\u0300", "K",
263             "\u00A0", "\u0020", "K",
264             "\u00A8", "\u0020\u0308", "K",
265             "\u00AA", "\u0061", "K",
266             "\u00AF", "\u0020\u0304", "K",
267             "\u00B2", "\u0032", "K",
268             "\u00B3", "\u0033", "K",
269             "\u00B4", "\u0020\u0301", "K",
270             "\u00B5", "\u03BC", "K",
271             "\u00B8", "\u0020\u0327", "K",
272             "\u00B9", "\u0031", "K",
273             "\u00BA", "\u006F", "K",
274             "\u00BC", "\u0031\u2044\u0034", "K",
275             "\u00BD", "\u0031\u2044\u0032", "K",
276             "\u00BE", "\u0033\u2044\u0034", "K",
277             "\u00C0", "\u0041\u0300", "",
278             "\u00C1", "\u0041\u0301", "",
279             "\u00C2", "\u0041\u0302", "",
280             "\u00C3", "\u0041\u0303", "",
281             "\u00C4", "\u0041\u0308", "",
282             "\u00C5", "\u0041\u030A", "",
283             "\u00C7", "\u0043\u0327", "",
284             "\u00C8", "\u0045\u0300", "",
285             "\u00C9", "\u0045\u0301", "",
286             "\u00CA", "\u0045\u0302", "",
287             "\u00CB", "\u0045\u0308", "",
288             "\u00CC", "\u0049\u0300", "",
289             "\u00CD", "\u0049\u0301", "",
290             "\u00CE", "\u0049\u0302", "",
291             "\u00CF", "\u0049\u0308", "",
292             "\u00D1", "\u004E\u0303", "",
293             "\u00D2", "\u004F\u0300", "",
294             "\u00D3", "\u004F\u0301", "",
295             "\u00D4", "\u004F\u0302", "",
296             "\u00D5", "\u004F\u0303", "",
297             "\u00D6", "\u004F\u0308", "",
298             "\u00D9", "\u0055\u0300", "",
299             "\u00DA", "\u0055\u0301", "",
300             "\u00DB", "\u0055\u0302", "",
301             "\u00DC", "\u0055\u0308", "",
302             "\u00DD", "\u0059\u0301", "",
303             "\u00E0", "\u0061\u0300", "",
304             "\u00E1", "\u0061\u0301", "",
305             "\u00E2", "\u0061\u0302", "",
306             "\u00E3", "\u0061\u0303", "",
307             "\u00E4", "\u0061\u0308", "",
308             "\u00E5", "\u0061\u030A", "",
309             "\u00E7", "\u0063\u0327", "",
310             "\u00E8", "\u0065\u0300", "",
311             "\u00E9", "\u0065\u0301", "",
312             "\u00EA", "\u0065\u0302", "",
313             "\u00EB", "\u0065\u0308", "",
314             "\u00EC", "\u0069\u0300", "",
315             "\u00ED", "\u0069\u0301", "",
316             "\u00EE", "\u0069\u0302", "",
317             "\u00EF", "\u0069\u0308", "",
318             "\u00F1", "\u006E\u0303", "",
319             "\u00F2", "\u006F\u0300", "",
320             "\u00F3", "\u006F\u0301", "",
321             "\u00F4", "\u006F\u0302", "",
322             "\u00F5", "\u006F\u0303", "",
323             "\u00F6", "\u006F\u0308", "",
324             "\u00F9", "\u0075\u0300", "",
325             "\u00FA", "\u0075\u0301", "",
326             "\u00FB", "\u0075\u0302", "",
327             "\u00FC", "\u0075\u0308", "",
328             "\u00FD", "\u0079\u0301", "",
329 // EXTRAS, outside of Latin 1
330             "\u1EA4", "\u00C2\u0301", "",
331             "\u1EA5", "\u00E2\u0301", "",
332             "\u1EA6", "\u00C2\u0300", "",
333             "\u1EA7", "\u00E2\u0300", "",
334         };
335
336         int[] classData = {
337             0x0300, 230,
338             0x0301, 230,
339             0x0302, 230,
340             0x0303, 230,
341             0x0304, 230,
342             0x0305, 230,
343             0x0306, 230,
344             0x0307, 230,
345             0x0308, 230,
346             0x0309, 230,
347             0x030A, 230,
348             0x030B, 230,
349             0x030C, 230,
350             0x030D, 230,
351             0x030E, 230,
352             0x030F, 230,
353             0x0310, 230,
354             0x0311, 230,
355             0x0312, 230,
356             0x0313, 230,
357             0x0314, 230,
358             0x0315, 232,
359             0x0316, 220,
360             0x0317, 220,
361             0x0318, 220,
362             0x0319, 220,
363             0x031A, 232,
364             0x031B, 216,
365             0x031C, 220,
366             0x031D, 220,
367             0x031E, 220,
368             0x031F, 220,
369             0x0320, 220,
370             0x0321, 202,
371             0x0322, 202,
372             0x0323, 220,
373             0x0324, 220,
374             0x0325, 220,
375             0x0326, 220,
376             0x0327, 202,
377             0x0328, 202,
378             0x0329, 220,
379             0x032A, 220,
380             0x032B, 220,
381             0x032C, 220,
382             0x032D, 220,
383             0x032E, 220,
384             0x032F, 220,
385             0x0330, 220,
386             0x0331, 220,
387             0x0332, 220,
388             0x0333, 220,
389             0x0334, 1,
390             0x0335, 1,
391             0x0336, 1,
392             0x0337, 1,
393             0x0338, 1,
394             0x0339, 220,
395             0x033A, 220,
396             0x033B, 220,
397             0x033C, 220,
398             0x033D, 230,
399             0x033E, 230,
400             0x033F, 230,
401             0x0340, 230,
402             0x0341, 230,
403             0x0342, 230,
404             0x0343, 230,
405             0x0344, 230,
406             0x0345, 240,
407             0x0360, 234,
408             0x0361, 234
409         };
410
411         // build the same tables we would otherwise get from the
412         // Unicode Character Database, just with limited data
413
414         for (int i = 0; i < decomposeData.length; i+=3) {
415             char value = decomposeData[i].charAt(0);
416             String decomp = decomposeData[i+1];
417             boolean compat = decomposeData[i+2].equals("K");
418             if (compat) isCompatibility.set(value);
419             decompose.put(value, decomp);
420             if (!compat) {
421                 int first = '\u0000';
422                 int second = UTF16Util.nextCodePoint(decomp, 0);
423                 if (decomp.length() > 1) {
424                     first = second;
425                     second = UTF16Util.nextCodePoint(decomp,
426                         UTF16Util.codePointLength(first));
427                 }
428                 long pair = (first << 16) | second;
429                 compose.put(pair, value);
430             }
431         }
432
433         for (int i = 0; i < classData.length;) {
434             canonicalClass.put(classData[i++], classData[i++]);
435         }
436     }
437
438     /**
439      * Utility: Parses a sequence of hex Unicode characters separated by spaces
440      */
441     static public String fromHex(String source) {
442         StringBuffer result = new StringBuffer();
443         for (int i = 0; i < source.length(); ++i) {
444             char c = source.charAt(i);
445             switch (c) {
446               case ' ': break; // ignore
447               case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
448               case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
449               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
450                 int end = 0;
451                 int value = 0;
452                 try {
453                     //System.out.println(source.substring(i, i + 4) + "************" + source);
454                     end = source.indexOf(' ',i);
455                     if (end < 0) {
456                         end = source.length();
457                     }
458                     value = Integer.parseInt(source.substring(i, end),16);
459                     UTF16Util.appendCodePoint(result, value);
460                 } catch (Exception e) {
461                     System.out.println("i: " + i + ";end:" + end + "source:" + source);
462                     //System.out.println(source.substring(i, i + 4) + "************" + source);
463                     System.exit(1);
464                 }
465                 //i+= 3; // skip rest of number
466                 i = end;
467                 break;
468               case '<': int j = source.indexOf('>',i); // skip <...>
469                 if (j > 0) {
470                     i = j;
471                     break;
472                 } // else fall through--error
473               default:
474                 throw new IllegalArgumentException("Bad hex value in " + source);
475             }
476         }
477         return result.toString();
478     }
479
480     /**
481      * Utility: Supplies a zero-padded hex representation of an integer (without 0x)
482      */
483     static public String hex(int i) {
484         String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
485         return "00000000".substring(result.length(),8) + result;
486     }
487
488     /**
489      * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
490      */
491     static public String hex(char i) {
492         String result = Integer.toString(i, 16).toUpperCase();
493         return "0000".substring(result.length(),4) + result;
494     }
495
496     /**
497      * Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
498      */
499     public static String hex(String s, String sep) {
500         StringBuffer result = new StringBuffer();
501         for (int i = 0; i < s.length(); ++i) {
502             if (i != 0) result.append(sep);
503             result.append(hex(s.charAt(i)));
504         }
505         return result.toString();
506     }
507 }