2 *******************************************************************************
\r
3 * Copyright (C) 2002-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
9 package com.ibm.icu.dev.tool.layout;
\r
11 import com.ibm.icu.lang.UCharacter;
\r
12 import com.ibm.icu.lang.UScript;
\r
13 import com.ibm.icu.text.UTF16;
\r
14 import com.ibm.icu.text.UnicodeSet;
\r
17 * @author Eric Mader
\r
21 * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
\r
24 * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
\r
25 * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
\r
27 * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
\r
28 * or process them one script at a time? It's probably a good idea to build a single table for
\r
29 * however many scripts there are.
\r
31 * It might be better to collect all the characters that have a canonical decomposition and just
\r
32 * sort them into however many scripts there are... unless we'll get characters in COMMON???
\r
34 public class CanonGSUBBuilder
\r
36 static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
\r
38 int leftType = ArabicShaping.VALUE_NONE;
\r
39 int rightType = ArabicShaping.VALUE_NONE;
\r
42 case UCharacter.DecompositionType.ISOLATED:
\r
45 case UCharacter.DecompositionType.FINAL:
\r
46 rightType = ArabicShaping.VALUE_LEFT;
\r
49 case UCharacter.DecompositionType.INITIAL:
\r
50 leftType = ArabicShaping.VALUE_RIGHT;
\r
53 case UCharacter.DecompositionType.MEDIAL:
\r
54 rightType = ArabicShaping.VALUE_LEFT;
\r
55 leftType = ArabicShaping.VALUE_RIGHT;
\r
59 return decomp + UCharacter.toString(ligature);
\r
62 char[] chars = decomp.toCharArray();
\r
64 ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
\r
66 return new String(chars) + UCharacter.toString(ligature);
\r
69 static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
\r
70 ClassTable finaClassTable, ClassTable isolClassTable)
\r
72 System.out.print("Finding Arabic contextual forms... ");
\r
74 for (int i = 0; i < data.countRecords(); i += 1) {
\r
75 ArabicCharacterData.Record record = data.getRecord(i);
\r
76 String decomposition = record.getDecomposition();
\r
78 if (decomposition != null && decomposition.length() == 1) {
\r
79 int contextual = record.getCodePoint();
\r
80 int isolated = UTF16.charAt(record.getDecomposition(), 0);
\r
82 switch (record.getDecompositionType()) {
\r
83 case UCharacter.DecompositionType.INITIAL:
\r
84 initClassTable.addMapping(isolated, contextual);
\r
87 case UCharacter.DecompositionType.MEDIAL:
\r
88 mediClassTable.addMapping(isolated, contextual);
\r
91 case UCharacter.DecompositionType.FINAL:
\r
92 finaClassTable.addMapping(isolated, contextual);
\r
95 case UCharacter.DecompositionType.ISOLATED:
\r
96 isolClassTable.addMapping(isolated, contextual);
\r
100 // issue some error message?
\r
106 System.out.println("Done.");
\r
109 static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
\r
111 LigatureTree contextualTree = new LigatureTree();
\r
112 int ligatureCount = 0;
\r
114 System.out.print("Building Arabic ligature tree... ");
\r
116 for (int i = 0; i < data.countRecords(); i += 1) {
\r
117 ArabicCharacterData.Record record = data.getRecord(i);
\r
118 String decomposition = record.getDecomposition();
\r
120 if (decomposition != null && decomposition.length() > 1) {
\r
121 int ligature = record.getCodePoint();
\r
122 int decompType = record.getDecompositionType();
\r
124 switch (decompType) {
\r
125 case UCharacter.DecompositionType.FINAL:
\r
126 case UCharacter.DecompositionType.INITIAL:
\r
127 case UCharacter.DecompositionType.MEDIAL:
\r
128 case UCharacter.DecompositionType.ISOLATED:
\r
129 contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
\r
130 ligatureCount += 1;
\r
133 case UCharacter.DecompositionType.CANONICAL:
\r
134 //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
\r
140 System.out.println(ligatureCount + " ligatures.");
\r
142 return contextualTree;
\r
145 static final int SIMPLE_GLYPH = 1;
\r
146 static final int LIGATURE_GLYPH = 2;
\r
147 static final int MARK_GLYPH = 3;
\r
148 static final int COMPONENT_GLYPH = 4;
\r
150 static final int categoryClassMap[] = {
\r
152 SIMPLE_GLYPH, // UPPERCASE_LETTER
\r
153 SIMPLE_GLYPH, // LOWERCASE_LETTER
\r
154 SIMPLE_GLYPH, // TITLECASE_LETTER
\r
155 SIMPLE_GLYPH, // MODIFIER_LETTER
\r
156 SIMPLE_GLYPH, // OTHER_LETTER
\r
157 MARK_GLYPH, // NON_SPACING_MARK
\r
158 MARK_GLYPH, // ENCLOSING_MARK ??
\r
159 MARK_GLYPH, // COMBINING_SPACING_MARK ??
\r
160 SIMPLE_GLYPH, // DECIMAL_NUMBER
\r
161 SIMPLE_GLYPH, // LETTER_NUMBER
\r
162 SIMPLE_GLYPH, // OTHER_NUMBER;
\r
163 0, // SPACE_SEPARATOR
\r
164 0, // LINE_SEPARATOR
\r
165 0, // PARAGRAPH_SEPARATOR
\r
170 SIMPLE_GLYPH, // DASH_PUNCTUATION
\r
171 SIMPLE_GLYPH, // START_PUNCTUATION
\r
172 SIMPLE_GLYPH, // END_PUNCTUATION
\r
173 SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION
\r
174 SIMPLE_GLYPH, // OTHER_PUNCTUATION
\r
175 SIMPLE_GLYPH, // MATH_SYMBOL;
\r
176 SIMPLE_GLYPH, // CURRENCY_SYMBOL
\r
177 SIMPLE_GLYPH, // MODIFIER_SYMBOL
\r
178 SIMPLE_GLYPH, // OTHER_SYMBOL
\r
179 SIMPLE_GLYPH, // INITIAL_PUNCTUATION
\r
180 SIMPLE_GLYPH // FINAL_PUNCTUATION
\r
183 static int getGlyphClass(ArabicCharacterData.Record record)
\r
185 String decomp = record.getDecomposition();
\r
187 if (decomp != null && decomp.length() > 1) {
\r
188 return LIGATURE_GLYPH;
\r
191 return categoryClassMap[record.getGeneralCategory()];
\r
194 static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
\r
196 System.out.print("Adding Arabic glyph classes... ");
\r
198 for (int i = 0; i < data.countRecords(); i += 1) {
\r
199 ArabicCharacterData.Record record = data.getRecord(i);
\r
200 classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
\r
203 System.out.println("Done.");
\r
206 private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
\r
207 LookupList lookupList, ClassTable classTable) {
\r
208 // TODO: Might want to have the ligature table builder explicitly check for ligatures
\r
209 // which start with space and tatweel rather than pulling them out here...
\r
210 UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
\r
211 UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
\r
212 UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
\r
213 ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
\r
215 addArabicGlyphClasses(arabicData, classTable);
\r
217 ClassTable initClassTable = new ClassTable();
\r
218 ClassTable mediClassTable = new ClassTable();
\r
219 ClassTable finaClassTable = new ClassTable();
\r
220 ClassTable isolClassTable = new ClassTable();
\r
222 buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
\r
223 isolClassTable.snapshot();
\r
224 LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
\r
226 LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
\r
228 ligaTree.walk(ligaWalker);
\r
230 Lookup initLookup, mediLookup, finaLookup, ligaLookup;
\r
232 initLookup = new Lookup(Lookup.GSST_Single, 0);
\r
233 initLookup.addSubtable(initClassTable);
\r
235 mediLookup = new Lookup(Lookup.GSST_Single, 0);
\r
236 mediLookup.addSubtable(mediClassTable);
\r
238 finaLookup = new Lookup(Lookup.GSST_Single, 0);
\r
239 finaLookup.addSubtable(finaClassTable);
\r
241 ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
\r
242 ligaLookup.addSubtable(ligaWalker);
\r
244 Feature init = new Feature("init");
\r
245 Feature medi = new Feature("medi");
\r
246 Feature fina = new Feature("fina");
\r
247 Feature liga = new Feature("liga");
\r
249 init.addLookup(lookupList.addLookup(initLookup));
\r
250 medi.addLookup(lookupList.addLookup(mediLookup));
\r
251 fina.addLookup(lookupList.addLookup(finaLookup));
\r
252 liga.addLookup(lookupList.addLookup(ligaLookup));
\r
254 featureList.addFeature(init);
\r
255 featureList.addFeature(medi);
\r
256 featureList.addFeature(fina);
\r
257 featureList.addFeature(liga);
\r
259 scriptList.addFeature("arab", "(default)", init);
\r
260 scriptList.addFeature("arab", "(default)", medi);
\r
261 scriptList.addFeature("arab", "(default)", fina);
\r
262 scriptList.addFeature("arab", "(default)", liga);
\r
264 System.out.println();
\r
267 public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
\r
269 int ligatureCount = 0;
\r
271 System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
\r
273 for (int i = 0; i < data.countRecords(script); i += 1) {
\r
274 CanonicalCharacterData.Record record = data.getRecord(script, i);
\r
275 String composed = UCharacter.toString(record.getComposedCharacter());
\r
277 for (int e = 0; e < record.countEquivalents(); e += 1) {
\r
278 String equivalent = record.getEquivalent(e);
\r
280 ligatureTree.insert(equivalent + composed);
\r
281 ligatureCount += 1;
\r
285 System.out.println(ligatureCount + " ligatures.");
\r
288 public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
\r
290 int maxDecompCount = data.getMaxEquivalents(script);
\r
291 DecompTable[] decompTables = new DecompTable[maxDecompCount];
\r
293 System.out.print("Building decompositon tables for " + UScript.getName(script) +
\r
294 "... total decompositions: " + data.countRecords(script) +
\r
295 ", max: " + maxDecompCount + "...");
\r
297 for (int i = 0; i < maxDecompCount; i += 1) {
\r
298 DecompTable table = new DecompTable();
\r
300 for (int r = 0; r < data.countRecords(script); r += 1) {
\r
301 CanonicalCharacterData.Record record = data.getRecord(script, r);
\r
303 if (record.countEquivalents() > i) {
\r
304 table.add(record.getComposedCharacter(), record.getEquivalent(i));
\r
308 decompTables[i] = table;
\r
311 System.out.println(" Done.");
\r
313 return decompTables;
\r
316 public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
\r
318 int[] lookups = new int[2];
\r
320 DecompTable[] decompTables = buildDecompTables(data, script);
\r
322 LigatureTree compTree = new LigatureTree();
\r
324 buildLigatureTree(data, script, compTree);
\r
326 System.out.println();
\r
328 LigatureTreeWalker compWalker = new LigatureTreeWalker();
\r
330 compTree.walk(compWalker);
\r
332 Lookup compLookup, dcmpLookup;
\r
333 //int compLookupIndex, dcmpLookupIndex;
\r
335 compLookup = new Lookup(Lookup.GSST_Ligature, 0);
\r
336 compLookup.addSubtable(compWalker);
\r
338 dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
\r
339 for (int i = 0; i < decompTables.length; i += 1) {
\r
340 dcmpLookup.addSubtable(decompTables[i]);
\r
343 lookups[0] = lookupList.addLookup(compLookup);
\r
344 lookups[1] = lookupList.addLookup(dcmpLookup);
\r
349 public static void addLookups(Feature feature, int[] lookups)
\r
351 for (int i = 0; i < lookups.length; i += 1) {
\r
352 feature.addLookup(lookups[i]);
\r
357 * Hebrew mark order taken from the SBL Hebrew Font manual
\r
358 * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
\r
360 public static ClassTable buildCombiningClassTable()
\r
362 UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
\r
363 ClassTable exceptions = new ClassTable();
\r
364 ClassTable combiningClasses = new ClassTable();
\r
365 int markCount = markSet.size();
\r
367 exceptions.addMapping(0x05C1, 10); // Point Shin Dot
\r
368 exceptions.addMapping(0x05C2, 11); // Point Sin Dot
\r
369 exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq
\r
370 exceptions.addMapping(0x05BF, 23); // Point Rafe
\r
371 exceptions.addMapping(0x05B9, 27); // Point Holam
\r
372 exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
\r
373 exceptions.addMapping(0x0591, 220); // Accent Etnahta
\r
374 exceptions.addMapping(0x0596, 220); // Accent Tipeha
\r
375 exceptions.addMapping(0x059B, 220); // Accent Tevir
\r
376 exceptions.addMapping(0x05A3, 220); // Accent Munah
\r
377 exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
\r
378 exceptions.addMapping(0x05A5, 220); // Accent Merkha
\r
379 exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
\r
380 exceptions.addMapping(0x05A7, 220); // Accent Darga
\r
381 exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
\r
382 exceptions.addMapping(0x05B0, 220); // Point Sheva
\r
383 exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
\r
384 exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
\r
385 exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
\r
386 exceptions.addMapping(0x05B4, 220); // Point Hiriq
\r
387 exceptions.addMapping(0x05B5, 220); // Point Tsere
\r
388 exceptions.addMapping(0x05B6, 220); // Point Segol
\r
389 exceptions.addMapping(0x05B7, 220); // Point Patah
\r
390 exceptions.addMapping(0x05B8, 220); // Point Qamats
\r
391 exceptions.addMapping(0x05BB, 220); // Point Qubuts
\r
392 exceptions.addMapping(0x05BD, 220); // Point Meteg
\r
393 exceptions.addMapping(0x059A, 222); // Accent Yetiv
\r
394 exceptions.addMapping(0x05AD, 222); // Accent Dehi
\r
395 exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
\r
396 exceptions.addMapping(0x0593, 230); // Accent Shalshelet
\r
397 exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
\r
398 exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
\r
399 exceptions.addMapping(0x0597, 230); // Accent Revia
\r
400 exceptions.addMapping(0x0598, 230); // Accent Zarqa
\r
401 exceptions.addMapping(0x059F, 230); // Accent Qarney Para
\r
402 exceptions.addMapping(0x059E, 230); // Accent Gershayim
\r
403 exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
\r
404 exceptions.addMapping(0x059C, 230); // Accent Geresh
\r
405 exceptions.addMapping(0x0592, 230); // Accent Segolta
\r
406 exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
\r
407 exceptions.addMapping(0x05AC, 230); // Accent Iluy
\r
408 exceptions.addMapping(0x05A8, 230); // Accent Qadma
\r
409 exceptions.addMapping(0x05AB, 230); // Accent Ole
\r
410 exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
\r
411 exceptions.addMapping(0x05A1, 230); // Accent Pazer
\r
412 //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
\r
413 exceptions.addMapping(0x05AE, 232); // Accent Zinor
\r
414 exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
\r
415 exceptions.addMapping(0x0599, 232); // Accent Pashta
\r
417 exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW
\r
418 exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE
\r
420 exceptions.addMapping(0x0651, 28); // ARABIC SHADDA
\r
422 exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF
\r
423 exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF
\r
425 exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN
\r
426 exceptions.addMapping(0x0650, 30); // ARABIC KASRA
\r
428 exceptions.addMapping(0x0652, 31); // ARABIC SUKUN
\r
429 exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
\r
431 exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN
\r
432 exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN
\r
433 exceptions.addMapping(0x064E, 31); // ARABIC FATHA
\r
434 exceptions.addMapping(0x064F, 31); // ARABIC DAMMA
\r
435 exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA
\r
436 exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA
\r
438 exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE
\r
440 exceptions.snapshot();
\r
442 for (int i = 0; i < markCount; i += 1) {
\r
443 int mark = markSet.charAt(i);
\r
444 int markClass = exceptions.getGlyphClassID(mark);
\r
446 if (markClass == 0) {
\r
447 markClass = UCharacter.getCombiningClass(mark);
\r
450 combiningClasses.addMapping(mark, markClass);
\r
453 combiningClasses.snapshot();
\r
454 return combiningClasses;
\r
457 public static void buildDecompTables(String fileName)
\r
459 // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
\r
460 //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
\r
461 UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
\r
462 CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
\r
463 ClassTable classTable = new ClassTable();
\r
465 LookupList lookupList = new LookupList();
\r
466 FeatureList featureList = new FeatureList();
\r
467 ScriptList scriptList = new ScriptList();
\r
469 // build common, inherited lookups...
\r
470 // int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
\r
471 // int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
\r
473 for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
\r
475 // This is a bit lame, but it's the only way I can think of
\r
476 // to make this work w/o knowing the values of COMMON and INHERITED...
\r
477 if (script == UScript.COMMON || script == UScript.INHERITED ||
\r
478 data.getMaxEquivalents(script) == 0) {
\r
482 int[] lookups = buildLookups(data, lookupList, script);
\r
484 Feature ccmp = new Feature("ccmp");
\r
486 addLookups(ccmp, lookups);
\r
487 // addLookups(ccmp, commonLookups);
\r
488 // addLookups(ccmp, inheritedLookups);
\r
490 featureList.addFeature(ccmp);
\r
492 String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
\r
494 scriptList.addFeature(scriptTag, "(default)", ccmp);
\r
496 if (script == UScript.ARABIC) {
\r
497 buildArabicTables(scriptList, featureList, lookupList, classTable);
\r
501 featureList.finalizeFeatureList();
\r
503 ClassTable markClassTable = buildCombiningClassTable();
\r
505 GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
\r
506 GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
\r
507 String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
\r
509 LigatureModuleWriter writer = new LigatureModuleWriter();
\r
511 writer.openFile(fileName);
\r
512 writer.writeHeader(null, includeFiles);
\r
513 writer.writeTable(gsubWriter);
\r
514 writer.writeTable(gdefWriter);
\r
515 writer.writeTrailer();
\r
516 writer.closeFile();
\r
519 public static void main(String[] args)
\r
521 buildDecompTables(args[0]);
\r