2 *******************************************************************************
3 * Copyright (C) 2002-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
9 package com.ibm.icu.dev.tool.layout;
11 import com.ibm.icu.lang.UCharacter;
12 import com.ibm.icu.lang.UScript;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.text.UnicodeSet;
/**
 * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
 * decomposition.
 *
 * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
 * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
 *
 * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
 * or process them one script at a time? It's probably a good idea to build a single table for
 * however many scripts there are.
 *
 * It might be better to collect all the characters that have a canonical decomposition and just
 * sort them into however many scripts there are... unless we'll get characters in COMMON???
 */
34 public class CanonGSUBBuilder
36 static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
38 int leftType = ArabicShaping.VALUE_NONE;
39 int rightType = ArabicShaping.VALUE_NONE;
42 case UCharacter.DecompositionType.ISOLATED:
45 case UCharacter.DecompositionType.FINAL:
46 rightType = ArabicShaping.VALUE_LEFT;
49 case UCharacter.DecompositionType.INITIAL:
50 leftType = ArabicShaping.VALUE_RIGHT;
53 case UCharacter.DecompositionType.MEDIAL:
54 rightType = ArabicShaping.VALUE_LEFT;
55 leftType = ArabicShaping.VALUE_RIGHT;
59 return decomp + UCharacter.toString(ligature);
62 char[] chars = decomp.toCharArray();
64 ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
66 return new String(chars) + UCharacter.toString(ligature);
69 static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
70 ClassTable finaClassTable, ClassTable isolClassTable)
72 System.out.print("Finding Arabic contextual forms... ");
74 for (int i = 0; i < data.countRecords(); i += 1) {
75 ArabicCharacterData.Record record = data.getRecord(i);
76 String decomposition = record.getDecomposition();
78 if (decomposition != null && decomposition.length() == 1) {
79 int contextual = record.getCodePoint();
80 int isolated = UTF16.charAt(record.getDecomposition(), 0);
82 switch (record.getDecompositionType()) {
83 case UCharacter.DecompositionType.INITIAL:
84 initClassTable.addMapping(isolated, contextual);
87 case UCharacter.DecompositionType.MEDIAL:
88 mediClassTable.addMapping(isolated, contextual);
91 case UCharacter.DecompositionType.FINAL:
92 finaClassTable.addMapping(isolated, contextual);
95 case UCharacter.DecompositionType.ISOLATED:
96 isolClassTable.addMapping(isolated, contextual);
100 // issue some error message?
106 System.out.println("Done.");
109 static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
111 LigatureTree contextualTree = new LigatureTree();
112 int ligatureCount = 0;
114 System.out.print("Building Arabic ligature tree... ");
116 for (int i = 0; i < data.countRecords(); i += 1) {
117 ArabicCharacterData.Record record = data.getRecord(i);
118 String decomposition = record.getDecomposition();
120 if (decomposition != null && decomposition.length() > 1) {
121 int ligature = record.getCodePoint();
122 int decompType = record.getDecompositionType();
124 switch (decompType) {
125 case UCharacter.DecompositionType.FINAL:
126 case UCharacter.DecompositionType.INITIAL:
127 case UCharacter.DecompositionType.MEDIAL:
128 case UCharacter.DecompositionType.ISOLATED:
129 contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
133 case UCharacter.DecompositionType.CANONICAL:
134 //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
140 System.out.println(ligatureCount + " ligatures.");
142 return contextualTree;
// Glyph class values. getGlyphClass() returns LIGATURE_GLYPH directly and takes the
// others from categoryClassMap; COMPONENT_GLYPH is not referenced in the code visible here.
// NOTE(review): the values 1-4 look like the OpenType GDEF glyph class numbering - confirm.
145 static final int SIMPLE_GLYPH = 1;
146 static final int LIGATURE_GLYPH = 2;
147 static final int MARK_GLYPH = 3;
148 static final int COMPONENT_GLYPH = 4;
// Glyph class for each general category. getGlyphClass() indexes this table with
// record.getGeneralCategory(), so the entry order has to follow the numbering of the
// general category values. The trailing comment on each entry names the category it
// stands for; an entry of 0 is none of the four glyph classes above.
// NOTE(review): the entries visible here do not name every category (for example there
// is no UNASSIGNED, LINE_SEPARATOR, CONTROL, FORMAT, PRIVATE_USE or SURROGATE entry) -
// confirm against the general category constants that the table is complete.
150 static final int categoryClassMap[] = {
152 SIMPLE_GLYPH, // UPPERCASE_LETTER
153 SIMPLE_GLYPH, // LOWERCASE_LETTER
154 SIMPLE_GLYPH, // TITLECASE_LETTER
155 SIMPLE_GLYPH, // MODIFIER_LETTER
156 SIMPLE_GLYPH, // OTHER_LETTER
157 MARK_GLYPH, // NON_SPACING_MARK
158 MARK_GLYPH, // ENCLOSING_MARK ??
159 MARK_GLYPH, // COMBINING_SPACING_MARK ??
160 SIMPLE_GLYPH, // DECIMAL_NUMBER
161 SIMPLE_GLYPH, // LETTER_NUMBER
162 SIMPLE_GLYPH, // OTHER_NUMBER;
163 0, // SPACE_SEPARATOR
165 0, // PARAGRAPH_SEPARATOR
170 SIMPLE_GLYPH, // DASH_PUNCTUATION
171 SIMPLE_GLYPH, // START_PUNCTUATION
172 SIMPLE_GLYPH, // END_PUNCTUATION
173 SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION
174 SIMPLE_GLYPH, // OTHER_PUNCTUATION
175 SIMPLE_GLYPH, // MATH_SYMBOL;
176 SIMPLE_GLYPH, // CURRENCY_SYMBOL
177 SIMPLE_GLYPH, // MODIFIER_SYMBOL
178 SIMPLE_GLYPH, // OTHER_SYMBOL
179 SIMPLE_GLYPH, // INITIAL_PUNCTUATION
180 SIMPLE_GLYPH // FINAL_PUNCTUATION
183 static int getGlyphClass(ArabicCharacterData.Record record)
185 String decomp = record.getDecomposition();
187 if (decomp != null && decomp.length() > 1) {
188 return LIGATURE_GLYPH;
191 return categoryClassMap[record.getGeneralCategory()];
194 static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
196 System.out.print("Adding Arabic glyph classes... ");
198 for (int i = 0; i < data.countRecords(); i += 1) {
199 ArabicCharacterData.Record record = data.getRecord(i);
200 classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
203 System.out.println("Done.");
206 private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
207 LookupList lookupList, ClassTable classTable) {
208 // TODO: Might want to have the ligature table builder explicitly check for ligatures
209 // which start with space and tatweel rather than pulling them out here...
210 UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
211 UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
212 UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
213 ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
215 addArabicGlyphClasses(arabicData, classTable);
217 ClassTable initClassTable = new ClassTable();
218 ClassTable mediClassTable = new ClassTable();
219 ClassTable finaClassTable = new ClassTable();
220 ClassTable isolClassTable = new ClassTable();
222 buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
223 isolClassTable.snapshot();
224 LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
226 LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
228 ligaTree.walk(ligaWalker);
230 Lookup initLookup, mediLookup, finaLookup, ligaLookup;
232 initLookup = new Lookup(Lookup.GSST_Single, 0);
233 initLookup.addSubtable(initClassTable);
235 mediLookup = new Lookup(Lookup.GSST_Single, 0);
236 mediLookup.addSubtable(mediClassTable);
238 finaLookup = new Lookup(Lookup.GSST_Single, 0);
239 finaLookup.addSubtable(finaClassTable);
241 ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
242 ligaLookup.addSubtable(ligaWalker);
244 Feature init = new Feature("init");
245 Feature medi = new Feature("medi");
246 Feature fina = new Feature("fina");
247 Feature liga = new Feature("liga");
249 init.addLookup(lookupList.addLookup(initLookup));
250 medi.addLookup(lookupList.addLookup(mediLookup));
251 fina.addLookup(lookupList.addLookup(finaLookup));
252 liga.addLookup(lookupList.addLookup(ligaLookup));
254 featureList.addFeature(init);
255 featureList.addFeature(medi);
256 featureList.addFeature(fina);
257 featureList.addFeature(liga);
259 scriptList.addFeature("arab", "(default)", init);
260 scriptList.addFeature("arab", "(default)", medi);
261 scriptList.addFeature("arab", "(default)", fina);
262 scriptList.addFeature("arab", "(default)", liga);
264 System.out.println();
267 public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
269 int ligatureCount = 0;
271 System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
273 for (int i = 0; i < data.countRecords(script); i += 1) {
274 CanonicalCharacterData.Record record = data.getRecord(script, i);
275 String composed = UCharacter.toString(record.getComposedCharacter());
277 for (int e = 0; e < record.countEquivalents(); e += 1) {
278 String equivalent = record.getEquivalent(e);
280 ligatureTree.insert(equivalent + composed);
285 System.out.println(ligatureCount + " ligatures.");
288 public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
290 int maxDecompCount = data.getMaxEquivalents(script);
291 DecompTable[] decompTables = new DecompTable[maxDecompCount];
293 System.out.print("Building decompositon tables for " + UScript.getName(script) +
294 "... total decompositions: " + data.countRecords(script) +
295 ", max: " + maxDecompCount + "...");
297 for (int i = 0; i < maxDecompCount; i += 1) {
298 DecompTable table = new DecompTable();
300 for (int r = 0; r < data.countRecords(script); r += 1) {
301 CanonicalCharacterData.Record record = data.getRecord(script, r);
303 if (record.countEquivalents() > i) {
304 table.add(record.getComposedCharacter(), record.getEquivalent(i));
308 decompTables[i] = table;
311 System.out.println(" Done.");
316 public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
318 int[] lookups = new int[2];
320 DecompTable[] decompTables = buildDecompTables(data, script);
322 LigatureTree compTree = new LigatureTree();
324 buildLigatureTree(data, script, compTree);
326 System.out.println();
328 LigatureTreeWalker compWalker = new LigatureTreeWalker();
330 compTree.walk(compWalker);
332 Lookup compLookup, dcmpLookup;
333 //int compLookupIndex, dcmpLookupIndex;
335 compLookup = new Lookup(Lookup.GSST_Ligature, 0);
336 compLookup.addSubtable(compWalker);
338 dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
339 for (int i = 0; i < decompTables.length; i += 1) {
340 dcmpLookup.addSubtable(decompTables[i]);
343 lookups[0] = lookupList.addLookup(compLookup);
344 lookups[1] = lookupList.addLookup(dcmpLookup);
349 public static void addLookups(Feature feature, int[] lookups)
351 for (int i = 0; i < lookups.length; i += 1) {
352 feature.addLookup(lookups[i]);
357 * Hebrew mark order taken from the SBL Hebrew Font manual
358 * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
360 public static ClassTable buildCombiningClassTable()
362 UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
363 ClassTable exceptions = new ClassTable();
364 ClassTable combiningClasses = new ClassTable();
365 int markCount = markSet.size();
367 exceptions.addMapping(0x05C1, 10); // Point Shin Dot
368 exceptions.addMapping(0x05C2, 11); // Point Sin Dot
369 exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq
370 exceptions.addMapping(0x05BF, 23); // Point Rafe
371 exceptions.addMapping(0x05B9, 27); // Point Holam
372 exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
373 exceptions.addMapping(0x0591, 220); // Accent Etnahta
374 exceptions.addMapping(0x0596, 220); // Accent Tipeha
375 exceptions.addMapping(0x059B, 220); // Accent Tevir
376 exceptions.addMapping(0x05A3, 220); // Accent Munah
377 exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
378 exceptions.addMapping(0x05A5, 220); // Accent Merkha
379 exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
380 exceptions.addMapping(0x05A7, 220); // Accent Darga
381 exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
382 exceptions.addMapping(0x05B0, 220); // Point Sheva
383 exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
384 exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
385 exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
386 exceptions.addMapping(0x05B4, 220); // Point Hiriq
387 exceptions.addMapping(0x05B5, 220); // Point Tsere
388 exceptions.addMapping(0x05B6, 220); // Point Segol
389 exceptions.addMapping(0x05B7, 220); // Point Patah
390 exceptions.addMapping(0x05B8, 220); // Point Qamats
391 exceptions.addMapping(0x05BB, 220); // Point Qubuts
392 exceptions.addMapping(0x05BD, 220); // Point Meteg
393 exceptions.addMapping(0x059A, 222); // Accent Yetiv
394 exceptions.addMapping(0x05AD, 222); // Accent Dehi
395 exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
396 exceptions.addMapping(0x0593, 230); // Accent Shalshelet
397 exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
398 exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
399 exceptions.addMapping(0x0597, 230); // Accent Revia
400 exceptions.addMapping(0x0598, 230); // Accent Zarqa
401 exceptions.addMapping(0x059F, 230); // Accent Qarney Para
402 exceptions.addMapping(0x059E, 230); // Accent Gershayim
403 exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
404 exceptions.addMapping(0x059C, 230); // Accent Geresh
405 exceptions.addMapping(0x0592, 230); // Accent Segolta
406 exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
407 exceptions.addMapping(0x05AC, 230); // Accent Iluy
408 exceptions.addMapping(0x05A8, 230); // Accent Qadma
409 exceptions.addMapping(0x05AB, 230); // Accent Ole
410 exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
411 exceptions.addMapping(0x05A1, 230); // Accent Pazer
412 //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
413 exceptions.addMapping(0x05AE, 232); // Accent Zinor
414 exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
415 exceptions.addMapping(0x0599, 232); // Accent Pashta
417 exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW
418 exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE
420 exceptions.addMapping(0x0651, 28); // ARABIC SHADDA
422 exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF
423 exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF
425 exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN
426 exceptions.addMapping(0x0650, 30); // ARABIC KASRA
428 exceptions.addMapping(0x0652, 31); // ARABIC SUKUN
429 exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
431 exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN
432 exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN
433 exceptions.addMapping(0x064E, 31); // ARABIC FATHA
434 exceptions.addMapping(0x064F, 31); // ARABIC DAMMA
435 exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA
436 exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA
438 exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE
440 exceptions.snapshot();
442 for (int i = 0; i < markCount; i += 1) {
443 int mark = markSet.charAt(i);
444 int markClass = exceptions.getGlyphClassID(mark);
446 if (markClass == 0) {
447 markClass = UCharacter.getCombiningClass(mark);
450 combiningClasses.addMapping(mark, markClass);
453 combiningClasses.snapshot();
454 return combiningClasses;
/**
 * Builds the canonical composition / decomposition lookups, script by script, adds the
 * Arabic tables, and writes the resulting GSUB and GDEF tables to a module file.
 *
 * @param fileName name passed to the module writer's openFile()
 */
457 public static void buildDecompTables(String fileName)
459 // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
460 //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
// Characters with a canonical decomposition, minus full composition exclusions and Hangul.
461 UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
462 CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
// Glyph classes; filled by buildArabicTables() and handed to the GDEF writer below.
463 ClassTable classTable = new ClassTable();
465 LookupList lookupList = new LookupList();
466 FeatureList featureList = new FeatureList();
467 ScriptList scriptList = new ScriptList();
469 // build common, inherited lookups...
470 // int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
471 // int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
// Visit every script code below UScript.CODE_LIMIT.
473 for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
475 // This is a bit lame, but it's the only way I can think of
476 // to make this work w/o knowing the values of COMMON and INHERITED...
// NOTE(review): the body of this test is not visible here; presumably COMMON, INHERITED
// and scripts without any equivalents are skipped - confirm.
477 if (script == UScript.COMMON || script == UScript.INHERITED ||
478 data.getMaxEquivalents(script) == 0) {
// Composition and decomposition lookups for this script (see buildLookups()).
482 int[] lookups = buildLookups(data, lookupList, script);
// Each script gets its own "ccmp" feature carrying its lookups.
484 Feature ccmp = new Feature("ccmp");
486 addLookups(ccmp, lookups);
487 // addLookups(ccmp, commonLookups);
488 // addLookups(ccmp, inheritedLookups);
490 featureList.addFeature(ccmp);
// The script tag is derived from the script's short name.
492 String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
494 scriptList.addFeature(scriptTag, "(default)", ccmp);
// Arabic additionally gets its contextual-form and ligature features.
496 if (script == UScript.ARABIC) {
497 buildArabicTables(scriptList, featureList, lookupList, classTable);
501 featureList.finalizeFeatureList();
// Mark classes for the GDEF writer (see buildCombiningClassTable()).
503 ClassTable markClassTable = buildCombiningClassTable();
505 GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
506 GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
// Header names handed to writeHeader() below.
507 String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
509 LigatureModuleWriter writer = new LigatureModuleWriter();
// Output order: header, GSUB table, GDEF table, trailer.
511 writer.openFile(fileName);
512 writer.writeHeader(null, includeFiles);
513 writer.writeTable(gsubWriter);
514 writer.writeTable(gdefWriter);
515 writer.writeTrailer();
// NOTE(review): no call that closes or flushes the writer is visible after writeTrailer() -
// confirm the output file is closed.
519 public static void main(String[] args)
521 buildDecompTables(args[0]);