]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/tools/misc/src/com/ibm/icu/dev/tool/layout/CanonGSUBBuilder.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / tools / misc / src / com / ibm / icu / dev / tool / layout / CanonGSUBBuilder.java
1 /**
2  *******************************************************************************
3  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7
8
9 package com.ibm.icu.dev.tool.layout;
10
11 import com.ibm.icu.lang.UCharacter;
12 import com.ibm.icu.lang.UScript;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.text.UnicodeSet;
15
16 /**
17  * @author Eric Mader
18  *
19  * Notes:
20  * 
21  * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
22  * decomposition.
23  *
24  * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
25  * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
26  * 
27  * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
28  * or process them one script at a time? It's probably a good idea to build a single table for
29  * however many scripts there are.
30  * 
31  * It might be better to collect all the characters that have a canonical decomposition and just
32  * sort them into however many scripts there are... unless we'll get characters in COMMON???
33  */
34 public class CanonGSUBBuilder
35 {
36     static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
37     {
38         int leftType  = ArabicShaping.VALUE_NONE;
39         int rightType = ArabicShaping.VALUE_NONE;
40         
41         switch (type) {
42             case UCharacter.DecompositionType.ISOLATED:
43                 break;
44                 
45             case UCharacter.DecompositionType.FINAL:
46                 rightType = ArabicShaping.VALUE_LEFT;
47                 break;
48             
49             case UCharacter.DecompositionType.INITIAL:
50                 leftType = ArabicShaping.VALUE_RIGHT;
51                 break;
52             
53             case UCharacter.DecompositionType.MEDIAL:
54                rightType = ArabicShaping.VALUE_LEFT;
55                leftType  = ArabicShaping.VALUE_RIGHT;
56                break;
57                
58            default:
59                return decomp + UCharacter.toString(ligature);
60         }
61         
62         char[] chars = decomp.toCharArray();
63               
64         ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
65   
66         return new String(chars) + UCharacter.toString(ligature);
67     }
68     
69     static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
70                                      ClassTable finaClassTable, ClassTable isolClassTable)
71     {
72         System.out.print("Finding Arabic contextual forms... ");
73         
74         for (int i = 0; i < data.countRecords(); i += 1) {
75             ArabicCharacterData.Record record = data.getRecord(i);
76             String decomposition = record.getDecomposition();
77             
78             if (decomposition != null && decomposition.length() == 1) {
79                 int contextual = record.getCodePoint();
80                 int isolated   = UTF16.charAt(record.getDecomposition(), 0);
81             
82                 switch (record.getDecompositionType()) {
83                 case UCharacter.DecompositionType.INITIAL:
84                     initClassTable.addMapping(isolated, contextual);
85                     break;
86                 
87                 case UCharacter.DecompositionType.MEDIAL:
88                     mediClassTable.addMapping(isolated, contextual);
89                     break;
90                 
91                case UCharacter.DecompositionType.FINAL:
92                    finaClassTable.addMapping(isolated, contextual);
93                    break;
94                    
95                case UCharacter.DecompositionType.ISOLATED:
96                    isolClassTable.addMapping(isolated, contextual);
97                    break;
98                
99                default:
100                    // issue some error message?
101                    break;
102                 }
103             }
104         }
105         
106         System.out.println("Done.");
107     }
108
109     static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
110     {
111         LigatureTree contextualTree = new LigatureTree();
112         int ligatureCount = 0;
113         
114         System.out.print("Building Arabic ligature tree... ");
115         
116         for (int i = 0; i < data.countRecords(); i += 1) {
117             ArabicCharacterData.Record record = data.getRecord(i);
118             String decomposition = record.getDecomposition();
119             
120             if (decomposition != null && decomposition.length() > 1) {
121                 int ligature   = record.getCodePoint();
122                 int decompType = record.getDecompositionType();
123                 
124                 switch (decompType) {
125                 case UCharacter.DecompositionType.FINAL:
126                 case UCharacter.DecompositionType.INITIAL:
127                 case UCharacter.DecompositionType.MEDIAL:
128                 case UCharacter.DecompositionType.ISOLATED:
129                     contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
130                     ligatureCount += 1;
131                     break;
132                     
133                 case UCharacter.DecompositionType.CANONICAL:
134                     //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
135                     break;
136                 }
137             }
138         }
139         
140         System.out.println(ligatureCount + " ligatures.");
141         
142         return contextualTree;
143     }
144     
145     static final int SIMPLE_GLYPH = 1;
146     static final int LIGATURE_GLYPH = 2;
147     static final int MARK_GLYPH = 3;
148     static final int COMPONENT_GLYPH = 4;
149     
150     static final int categoryClassMap[] = {
151     0,              // UNASSIGNED
152     SIMPLE_GLYPH,   // UPPERCASE_LETTER
153     SIMPLE_GLYPH,   // LOWERCASE_LETTER
154     SIMPLE_GLYPH,   // TITLECASE_LETTER
155     SIMPLE_GLYPH,   // MODIFIER_LETTER
156     SIMPLE_GLYPH,   // OTHER_LETTER
157     MARK_GLYPH,     // NON_SPACING_MARK
158     MARK_GLYPH,     // ENCLOSING_MARK ??
159     MARK_GLYPH,     // COMBINING_SPACING_MARK ??
160     SIMPLE_GLYPH,   // DECIMAL_NUMBER
161     SIMPLE_GLYPH,   // LETTER_NUMBER
162     SIMPLE_GLYPH,   // OTHER_NUMBER;
163     0,              // SPACE_SEPARATOR
164     0,              // LINE_SEPARATOR
165     0,              // PARAGRAPH_SEPARATOR
166     0,              // CONTROL
167     0,              // FORMAT
168     0,              // PRIVATE_USE
169     0,              // SURROGATE
170     SIMPLE_GLYPH,   // DASH_PUNCTUATION
171     SIMPLE_GLYPH,   // START_PUNCTUATION
172     SIMPLE_GLYPH,   // END_PUNCTUATION
173     SIMPLE_GLYPH,   // CONNECTOR_PUNCTUATION
174     SIMPLE_GLYPH,   // OTHER_PUNCTUATION
175     SIMPLE_GLYPH,   // MATH_SYMBOL;
176     SIMPLE_GLYPH,   // CURRENCY_SYMBOL
177     SIMPLE_GLYPH,   // MODIFIER_SYMBOL
178     SIMPLE_GLYPH,   // OTHER_SYMBOL
179     SIMPLE_GLYPH,   // INITIAL_PUNCTUATION
180     SIMPLE_GLYPH    // FINAL_PUNCTUATION
181     };
182
183     static int getGlyphClass(ArabicCharacterData.Record record)
184     {
185         String decomp = record.getDecomposition();
186         
187         if (decomp != null && decomp.length() > 1) {
188             return LIGATURE_GLYPH;
189         }
190         
191         return categoryClassMap[record.getGeneralCategory()];
192     }
193     
194     static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
195     {
196         System.out.print("Adding Arabic glyph classes... ");
197         
198         for (int i = 0; i < data.countRecords(); i += 1) {
199             ArabicCharacterData.Record record = data.getRecord(i);
200             classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
201         }
202         
203         System.out.println("Done.");
204     }
205     
206     private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
207                                                 LookupList lookupList, ClassTable classTable) {
208         // TODO: Might want to have the ligature table builder explicitly check for ligatures
209         // which start with space and tatweel rather than pulling them out here...
210         UnicodeSet arabicBlock   = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
211         UnicodeSet oddLigatures  = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
212         UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
213         ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
214
215         addArabicGlyphClasses(arabicData, classTable);
216         
217         ClassTable initClassTable = new ClassTable();
218         ClassTable mediClassTable = new ClassTable();
219         ClassTable finaClassTable = new ClassTable();
220         ClassTable isolClassTable = new ClassTable();
221         
222         buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
223         isolClassTable.snapshot();
224         LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
225
226         LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
227
228         ligaTree.walk(ligaWalker);
229         
230         Lookup initLookup, mediLookup, finaLookup, ligaLookup;
231         
232         initLookup = new Lookup(Lookup.GSST_Single, 0);
233         initLookup.addSubtable(initClassTable);
234         
235         mediLookup = new Lookup(Lookup.GSST_Single, 0);
236         mediLookup.addSubtable(mediClassTable);
237         
238         finaLookup = new Lookup(Lookup.GSST_Single, 0);
239         finaLookup.addSubtable(finaClassTable);
240         
241         ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
242         ligaLookup.addSubtable(ligaWalker);
243         
244         Feature init = new Feature("init");
245         Feature medi = new Feature("medi");
246         Feature fina = new Feature("fina");
247         Feature liga = new Feature("liga");
248         
249         init.addLookup(lookupList.addLookup(initLookup));
250         medi.addLookup(lookupList.addLookup(mediLookup));
251         fina.addLookup(lookupList.addLookup(finaLookup));
252         liga.addLookup(lookupList.addLookup(ligaLookup));
253         
254         featureList.addFeature(init);
255         featureList.addFeature(medi);
256         featureList.addFeature(fina);
257         featureList.addFeature(liga);
258         
259         scriptList.addFeature("arab", "(default)", init);
260         scriptList.addFeature("arab", "(default)", medi);
261         scriptList.addFeature("arab", "(default)", fina);
262         scriptList.addFeature("arab", "(default)", liga);
263         
264         System.out.println();
265     }
266
267     public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
268     {
269         int ligatureCount = 0;
270         
271         System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
272         
273         for (int i = 0; i < data.countRecords(script); i += 1) {
274             CanonicalCharacterData.Record record = data.getRecord(script, i);
275             String composed = UCharacter.toString(record.getComposedCharacter());
276             
277             for (int e = 0; e < record.countEquivalents(); e += 1) {
278                 String equivalent = record.getEquivalent(e);
279                 
280                 ligatureTree.insert(equivalent + composed);
281                 ligatureCount += 1;
282             }
283         }
284         
285         System.out.println(ligatureCount + " ligatures.");
286     }
287     
288     public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
289     {
290         int maxDecompCount = data.getMaxEquivalents(script);
291         DecompTable[] decompTables = new DecompTable[maxDecompCount];
292         
293         System.out.print("Building decompositon tables for " + UScript.getName(script) +
294                          "... total decompositions: " + data.countRecords(script) + 
295                          ", max: " + maxDecompCount + "...");
296         
297         for (int i = 0; i < maxDecompCount; i += 1) {
298             DecompTable table = new DecompTable();
299             
300             for (int r = 0; r < data.countRecords(script); r += 1) {
301                 CanonicalCharacterData.Record record = data.getRecord(script, r);
302                 
303                 if (record.countEquivalents() > i) {
304                     table.add(record.getComposedCharacter(), record.getEquivalent(i));
305                 }
306             }
307             
308             decompTables[i] = table;
309         }
310         
311         System.out.println(" Done.");
312         
313         return decompTables;
314     }
315     
316     public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
317     {
318         int[] lookups = new int[2];
319         
320         DecompTable[] decompTables = buildDecompTables(data, script);
321         
322         LigatureTree compTree = new LigatureTree();
323         
324         buildLigatureTree(data, script, compTree);
325         
326         System.out.println();
327         
328         LigatureTreeWalker compWalker = new LigatureTreeWalker();
329         
330         compTree.walk(compWalker);
331         
332         Lookup compLookup, dcmpLookup;
333         //int compLookupIndex, dcmpLookupIndex;
334         
335         compLookup = new Lookup(Lookup.GSST_Ligature, 0);
336         compLookup.addSubtable(compWalker);
337         
338         dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
339         for (int i = 0; i < decompTables.length; i += 1) {
340             dcmpLookup.addSubtable(decompTables[i]);
341         }
342         
343         lookups[0] = lookupList.addLookup(compLookup);
344         lookups[1] = lookupList.addLookup(dcmpLookup);
345         
346         return lookups;
347     }
348     
349     public static void addLookups(Feature feature, int[] lookups)
350     {
351         for (int i = 0; i < lookups.length; i += 1) {
352             feature.addLookup(lookups[i]);
353         }
354     }
355     
356     /*
357      * Hebrew mark order taken from the SBL Hebrew Font manual
358      * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
359      */
360     public static ClassTable buildCombiningClassTable()
361     {
362         UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
363         ClassTable exceptions = new ClassTable();
364         ClassTable combiningClasses = new ClassTable();
365         int markCount = markSet.size();
366         
367         exceptions.addMapping(0x05C1,  10); // Point Shin Dot
368         exceptions.addMapping(0x05C2,  11); // Point Sin Dot
369         exceptions.addMapping(0x05BC,  21); // Point Dagesh or Mapiq
370         exceptions.addMapping(0x05BF,  23); // Point Rafe
371         exceptions.addMapping(0x05B9,  27); // Point Holam
372         exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
373         exceptions.addMapping(0x0591, 220); // Accent Etnahta
374         exceptions.addMapping(0x0596, 220); // Accent Tipeha
375         exceptions.addMapping(0x059B, 220); // Accent Tevir
376         exceptions.addMapping(0x05A3, 220); // Accent Munah
377         exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
378         exceptions.addMapping(0x05A5, 220); // Accent Merkha
379         exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
380         exceptions.addMapping(0x05A7, 220); // Accent Darga
381         exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
382         exceptions.addMapping(0x05B0, 220); // Point Sheva
383         exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
384         exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
385         exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
386         exceptions.addMapping(0x05B4, 220); // Point Hiriq
387         exceptions.addMapping(0x05B5, 220); // Point Tsere
388         exceptions.addMapping(0x05B6, 220); // Point Segol
389         exceptions.addMapping(0x05B7, 220); // Point Patah
390         exceptions.addMapping(0x05B8, 220); // Point Qamats
391         exceptions.addMapping(0x05BB, 220); // Point Qubuts
392         exceptions.addMapping(0x05BD, 220); // Point Meteg
393         exceptions.addMapping(0x059A, 222); // Accent Yetiv
394         exceptions.addMapping(0x05AD, 222); // Accent Dehi
395         exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
396         exceptions.addMapping(0x0593, 230); // Accent Shalshelet
397         exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
398         exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
399         exceptions.addMapping(0x0597, 230); // Accent Revia
400         exceptions.addMapping(0x0598, 230); // Accent Zarqa
401         exceptions.addMapping(0x059F, 230); // Accent Qarney Para
402         exceptions.addMapping(0x059E, 230); // Accent Gershayim
403         exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
404         exceptions.addMapping(0x059C, 230); // Accent Geresh
405         exceptions.addMapping(0x0592, 230); // Accent Segolta
406         exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
407         exceptions.addMapping(0x05AC, 230); // Accent Iluy
408         exceptions.addMapping(0x05A8, 230); // Accent Qadma
409         exceptions.addMapping(0x05AB, 230); // Accent Ole
410         exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
411         exceptions.addMapping(0x05A1, 230); // Accent Pazer
412       //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
413         exceptions.addMapping(0x05AE, 232); // Accent Zinor
414         exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
415         exceptions.addMapping(0x0599, 232); // Accent Pashta
416         
417         exceptions.addMapping(0x0655,  27); // ARABIC HAMZA BELOW
418         exceptions.addMapping(0x0654,  27); // ARABIC HAMZA ABOVE
419
420         exceptions.addMapping(0x0651,  28); // ARABIC SHADDA
421
422         exceptions.addMapping(0x0656,  29); // ARABIC SUBSCRIPT ALEF
423         exceptions.addMapping(0x0670,  29); // ARABIC LETTER SUPERSCRIPT ALEF
424
425         exceptions.addMapping(0x064D,  30); // ARABIC KASRATAN
426         exceptions.addMapping(0x0650,  30); // ARABIC KASRA
427
428         exceptions.addMapping(0x0652,  31); // ARABIC SUKUN
429         exceptions.addMapping(0x06E1,  31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
430
431         exceptions.addMapping(0x064B,  31); // ARABIC FATHATAN
432         exceptions.addMapping(0x064C,  31); // ARABIC DAMMATAN
433         exceptions.addMapping(0x064E,  31); // ARABIC FATHA
434         exceptions.addMapping(0x064F,  31); // ARABIC DAMMA
435         exceptions.addMapping(0x0657,  31); // ARABIC INVERTED DAMMA
436         exceptions.addMapping(0x0658,  31); // ARABIC MARK NOON GHUNNA
437
438         exceptions.addMapping(0x0653,  32); // ARABIC MADDAH ABOVE
439         
440         exceptions.snapshot();
441         
442         for (int i = 0; i < markCount; i += 1) {
443             int mark = markSet.charAt(i);
444             int markClass = exceptions.getGlyphClassID(mark);
445             
446             if (markClass == 0) {
447                 markClass = UCharacter.getCombiningClass(mark);
448             }
449             
450             combiningClasses.addMapping(mark, markClass);
451         }
452         
453         combiningClasses.snapshot();
454         return combiningClasses;
455     }
456     
457     public static void buildDecompTables(String fileName)
458     {
459         // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
460       //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
461         UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
462         CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
463         ClassTable classTable = new ClassTable();
464         
465         LookupList  lookupList  = new LookupList();
466         FeatureList featureList = new FeatureList();
467         ScriptList  scriptList  = new ScriptList();
468
469         // build common, inherited lookups...
470 //        int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
471 //        int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
472         
473         for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
474             
475             // This is a bit lame, but it's the only way I can think of
476             // to make this work w/o knowing the values of COMMON and INHERITED...
477             if (script == UScript.COMMON || script == UScript.INHERITED ||
478                 data.getMaxEquivalents(script) == 0) {
479                 continue;
480             }
481             
482             int[] lookups = buildLookups(data, lookupList, script);
483
484             Feature ccmp = new Feature("ccmp");
485             
486             addLookups(ccmp, lookups);
487 //            addLookups(ccmp, commonLookups);
488 //            addLookups(ccmp, inheritedLookups);
489             
490             featureList.addFeature(ccmp);
491         
492             String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
493             
494             scriptList.addFeature(scriptTag, "(default)", ccmp);
495             
496             if (script == UScript.ARABIC) {
497                 buildArabicTables(scriptList, featureList, lookupList, classTable);
498             }
499         }
500         
501         featureList.finalizeFeatureList();
502         
503         ClassTable markClassTable = buildCombiningClassTable();
504         
505         GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
506         GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
507         String[] includeFiles = {"LETypes.h", "CanonShaping.h"};        
508         
509         LigatureModuleWriter writer = new LigatureModuleWriter();
510         
511         writer.openFile(fileName);
512         writer.writeHeader(null, includeFiles);
513         writer.writeTable(gsubWriter);
514         writer.writeTable(gdefWriter);
515         writer.writeTrailer();
516         writer.closeFile();
517     }
518     
519     public static void main(String[] args)
520     {
521         buildDecompTables(args[0]);
522     }
523 }