jars/icu4j-4_8_1_1/tools/misc/src/com/ibm/icu/dev/tool/layout/CanonGSUBBuilder.java

   1 /**
   2  *******************************************************************************
   3  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7
   8
   9 package com.ibm.icu.dev.tool.layout;
  10
  11 import com.ibm.icu.lang.UCharacter;
  12 import com.ibm.icu.lang.UScript;
  13 import com.ibm.icu.text.UTF16;
  14 import com.ibm.icu.text.UnicodeSet;
  15
  16 /**
  17  * @author Eric Mader
  18  *
  19  * Notes:
  20  *
  21  * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
  22  * decomposition.
  23  *
  24  * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
  25  * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
  26  *
  27  * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
  28  * or process them one script at a time. It's probably a good idea to build a single table for
  29  * however many scripts there are.
  30  *
  31  * It might be better to collect all the characters that have a canonical decomposition and just
  32  * sort them into however many scripts there are... unless we'll get characters in COMMON???
  33  */
  34 public class CanonGSUBBuilder
  35 {
  36     static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
  37     {
  38         int leftType  = ArabicShaping.VALUE_NONE;
  39         int rightType = ArabicShaping.VALUE_NONE;
  40
  41         switch (type) {
  42             case UCharacter.DecompositionType.ISOLATED:
  43                 break;
  44
  45             case UCharacter.DecompositionType.FINAL:
  46                 rightType = ArabicShaping.VALUE_LEFT;
  47                 break;
  48
  49             case UCharacter.DecompositionType.INITIAL:
  50                 leftType = ArabicShaping.VALUE_RIGHT;
  51                 break;
  52
  53             case UCharacter.DecompositionType.MEDIAL:
  54                rightType = ArabicShaping.VALUE_LEFT;
  55                leftType  = ArabicShaping.VALUE_RIGHT;
  56                break;
  57
  58            default:
  59                return decomp + UCharacter.toString(ligature);
  60         }
  61
  62         char[] chars = decomp.toCharArray();
  63
  64         ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
  65
  66         return new String(chars) + UCharacter.toString(ligature);
  67     }
  68
  69     static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
  70                                      ClassTable finaClassTable, ClassTable isolClassTable)
  71     {
  72         System.out.print("Finding Arabic contextual forms... ");
  73
  74         for (int i = 0; i < data.countRecords(); i += 1) {
  75             ArabicCharacterData.Record record = data.getRecord(i);
  76             String decomposition = record.getDecomposition();
  77
  78             if (decomposition != null && decomposition.length() == 1) {
  79                 int contextual = record.getCodePoint();
  80                 int isolated   = UTF16.charAt(record.getDecomposition(), 0);
  81
  82                 switch (record.getDecompositionType()) {
  83                 case UCharacter.DecompositionType.INITIAL:
  84                     initClassTable.addMapping(isolated, contextual);
  85                     break;
  86
  87                 case UCharacter.DecompositionType.MEDIAL:
  88                     mediClassTable.addMapping(isolated, contextual);
  89                     break;
  90
  91                case UCharacter.DecompositionType.FINAL:
  92                    finaClassTable.addMapping(isolated, contextual);
  93                    break;
  94
  95                case UCharacter.DecompositionType.ISOLATED:
  96                    isolClassTable.addMapping(isolated, contextual);
  97                    break;
  98
  99                default:
 100                    // issue some error message?
 101                    break;
 102                 }
 103             }
 104         }
 105
 106         System.out.println("Done.");
 107     }
 108
 109     static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
 110     {
 111         LigatureTree contextualTree = new LigatureTree();
 112         int ligatureCount = 0;
 113
 114         System.out.print("Building Arabic ligature tree... ");
 115
 116         for (int i = 0; i < data.countRecords(); i += 1) {
 117             ArabicCharacterData.Record record = data.getRecord(i);
 118             String decomposition = record.getDecomposition();
 119
 120             if (decomposition != null && decomposition.length() > 1) {
 121                 int ligature   = record.getCodePoint();
 122                 int decompType = record.getDecompositionType();
 123
 124                 switch (decompType) {
 125                 case UCharacter.DecompositionType.FINAL:
 126                 case UCharacter.DecompositionType.INITIAL:
 127                 case UCharacter.DecompositionType.MEDIAL:
 128                 case UCharacter.DecompositionType.ISOLATED:
 129                     contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
 130                     ligatureCount += 1;
 131                     break;
 132
 133                 case UCharacter.DecompositionType.CANONICAL:
 134                     //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
 135                     break;
 136                 }
 137             }
 138         }
 139
 140         System.out.println(ligatureCount + " ligatures.");
 141
 142         return contextualTree;
 143     }
 144
 145     static final int SIMPLE_GLYPH = 1;
 146     static final int LIGATURE_GLYPH = 2;
 147     static final int MARK_GLYPH = 3;
 148     static final int COMPONENT_GLYPH = 4;
 149
 150     static final int categoryClassMap[] = {
 151     0,              // UNASSIGNED
 152     SIMPLE_GLYPH,   // UPPERCASE_LETTER
 153     SIMPLE_GLYPH,   // LOWERCASE_LETTER
 154     SIMPLE_GLYPH,   // TITLECASE_LETTER
 155     SIMPLE_GLYPH,   // MODIFIER_LETTER
 156     SIMPLE_GLYPH,   // OTHER_LETTER
 157     MARK_GLYPH,     // NON_SPACING_MARK
 158     MARK_GLYPH,     // ENCLOSING_MARK ??
 159     MARK_GLYPH,     // COMBINING_SPACING_MARK ??
 160     SIMPLE_GLYPH,   // DECIMAL_NUMBER
 161     SIMPLE_GLYPH,   // LETTER_NUMBER
 162     SIMPLE_GLYPH,   // OTHER_NUMBER;
 163     0,              // SPACE_SEPARATOR
 164     0,              // LINE_SEPARATOR
 165     0,              // PARAGRAPH_SEPARATOR
 166     0,              // CONTROL
 167     0,              // FORMAT
 168     0,              // PRIVATE_USE
 169     0,              // SURROGATE
 170     SIMPLE_GLYPH,   // DASH_PUNCTUATION
 171     SIMPLE_GLYPH,   // START_PUNCTUATION
 172     SIMPLE_GLYPH,   // END_PUNCTUATION
 173     SIMPLE_GLYPH,   // CONNECTOR_PUNCTUATION
 174     SIMPLE_GLYPH,   // OTHER_PUNCTUATION
 175     SIMPLE_GLYPH,   // MATH_SYMBOL;
 176     SIMPLE_GLYPH,   // CURRENCY_SYMBOL
 177     SIMPLE_GLYPH,   // MODIFIER_SYMBOL
 178     SIMPLE_GLYPH,   // OTHER_SYMBOL
 179     SIMPLE_GLYPH,   // INITIAL_PUNCTUATION
 180     SIMPLE_GLYPH    // FINAL_PUNCTUATION
 181     };
 182
 183     static int getGlyphClass(ArabicCharacterData.Record record)
 184     {
 185         String decomp = record.getDecomposition();
 186
 187         if (decomp != null && decomp.length() > 1) {
 188             return LIGATURE_GLYPH;
 189         }
 190
 191         return categoryClassMap[record.getGeneralCategory()];
 192     }
 193
 194     static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
 195     {
 196         System.out.print("Adding Arabic glyph classes... ");
 197
 198         for (int i = 0; i < data.countRecords(); i += 1) {
 199             ArabicCharacterData.Record record = data.getRecord(i);
 200             classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
 201         }
 202
 203         System.out.println("Done.");
 204     }
 205
 206     private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
 207                                                 LookupList lookupList, ClassTable classTable) {
 208         // TODO: Might want to have the ligature table builder explicitly check for ligatures
 209         // which start with space and tatweel rather than pulling them out here...
 210         UnicodeSet arabicBlock   = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
 211         UnicodeSet oddLigatures  = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
 212         UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
 213         ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
 214
 215         addArabicGlyphClasses(arabicData, classTable);
 216
 217         ClassTable initClassTable = new ClassTable();
 218         ClassTable mediClassTable = new ClassTable();
 219         ClassTable finaClassTable = new ClassTable();
 220         ClassTable isolClassTable = new ClassTable();
 221
 222         buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
 223         isolClassTable.snapshot();
 224         LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
 225
 226         LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
 227
 228         ligaTree.walk(ligaWalker);
 229
 230         Lookup initLookup, mediLookup, finaLookup, ligaLookup;
 231
 232         initLookup = new Lookup(Lookup.GSST_Single, 0);
 233         initLookup.addSubtable(initClassTable);
 234
 235         mediLookup = new Lookup(Lookup.GSST_Single, 0);
 236         mediLookup.addSubtable(mediClassTable);
 237
 238         finaLookup = new Lookup(Lookup.GSST_Single, 0);
 239         finaLookup.addSubtable(finaClassTable);
 240
 241         ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
 242         ligaLookup.addSubtable(ligaWalker);
 243
 244         Feature init = new Feature("init");
 245         Feature medi = new Feature("medi");
 246         Feature fina = new Feature("fina");
 247         Feature liga = new Feature("liga");
 248
 249         init.addLookup(lookupList.addLookup(initLookup));
 250         medi.addLookup(lookupList.addLookup(mediLookup));
 251         fina.addLookup(lookupList.addLookup(finaLookup));
 252         liga.addLookup(lookupList.addLookup(ligaLookup));
 253
 254         featureList.addFeature(init);
 255         featureList.addFeature(medi);
 256         featureList.addFeature(fina);
 257         featureList.addFeature(liga);
 258
 259         scriptList.addFeature("arab", "(default)", init);
 260         scriptList.addFeature("arab", "(default)", medi);
 261         scriptList.addFeature("arab", "(default)", fina);
 262         scriptList.addFeature("arab", "(default)", liga);
 263
 264         System.out.println();
 265     }
 266
 267     public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
 268     {
 269         int ligatureCount = 0;
 270
 271         System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
 272
 273         for (int i = 0; i < data.countRecords(script); i += 1) {
 274             CanonicalCharacterData.Record record = data.getRecord(script, i);
 275             String composed = UCharacter.toString(record.getComposedCharacter());
 276
 277             for (int e = 0; e < record.countEquivalents(); e += 1) {
 278                 String equivalent = record.getEquivalent(e);
 279
 280                 ligatureTree.insert(equivalent + composed);
 281                 ligatureCount += 1;
 282             }
 283         }
 284
 285         System.out.println(ligatureCount + " ligatures.");
 286     }
 287
 288     public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
 289     {
 290         int maxDecompCount = data.getMaxEquivalents(script);
 291         DecompTable[] decompTables = new DecompTable[maxDecompCount];
 292
 293         System.out.print("Building decompositon tables for " + UScript.getName(script) +
 294                          "... total decompositions: " + data.countRecords(script) +
 295                          ", max: " + maxDecompCount + "...");
 296
 297         for (int i = 0; i < maxDecompCount; i += 1) {
 298             DecompTable table = new DecompTable();
 299
 300             for (int r = 0; r < data.countRecords(script); r += 1) {
 301                 CanonicalCharacterData.Record record = data.getRecord(script, r);
 302
 303                 if (record.countEquivalents() > i) {
 304                     table.add(record.getComposedCharacter(), record.getEquivalent(i));
 305                 }
 306             }
 307
 308             decompTables[i] = table;
 309         }
 310
 311         System.out.println(" Done.");
 312
 313         return decompTables;
 314     }
 315
 316     public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
 317     {
 318         int[] lookups = new int[2];
 319
 320         DecompTable[] decompTables = buildDecompTables(data, script);
 321
 322         LigatureTree compTree = new LigatureTree();
 323
 324         buildLigatureTree(data, script, compTree);
 325
 326         System.out.println();
 327
 328         LigatureTreeWalker compWalker = new LigatureTreeWalker();
 329
 330         compTree.walk(compWalker);
 331
 332         Lookup compLookup, dcmpLookup;
 333         //int compLookupIndex, dcmpLookupIndex;
 334
 335         compLookup = new Lookup(Lookup.GSST_Ligature, 0);
 336         compLookup.addSubtable(compWalker);
 337
 338         dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
 339         for (int i = 0; i < decompTables.length; i += 1) {
 340             dcmpLookup.addSubtable(decompTables[i]);
 341         }
 342
 343         lookups[0] = lookupList.addLookup(compLookup);
 344         lookups[1] = lookupList.addLookup(dcmpLookup);
 345
 346         return lookups;
 347     }
 348
 349     public static void addLookups(Feature feature, int[] lookups)
 350     {
 351         for (int i = 0; i < lookups.length; i += 1) {
 352             feature.addLookup(lookups[i]);
 353         }
 354     }
 355
 356     /*
 357      * Hebrew mark order taken from the SBL Hebrew Font manual
 358      * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
 359      */
 360     public static ClassTable buildCombiningClassTable()
 361     {
 362         UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
 363         ClassTable exceptions = new ClassTable();
 364         ClassTable combiningClasses = new ClassTable();
 365         int markCount = markSet.size();
 366
 367         exceptions.addMapping(0x05C1,  10); // Point Shin Dot
 368         exceptions.addMapping(0x05C2,  11); // Point Sin Dot
 369         exceptions.addMapping(0x05BC,  21); // Point Dagesh or Mapiq
 370         exceptions.addMapping(0x05BF,  23); // Point Rafe
 371         exceptions.addMapping(0x05B9,  27); // Point Holam
 372         exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
 373         exceptions.addMapping(0x0591, 220); // Accent Etnahta
 374         exceptions.addMapping(0x0596, 220); // Accent Tipeha
 375         exceptions.addMapping(0x059B, 220); // Accent Tevir
 376         exceptions.addMapping(0x05A3, 220); // Accent Munah
 377         exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
 378         exceptions.addMapping(0x05A5, 220); // Accent Merkha
 379         exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
 380         exceptions.addMapping(0x05A7, 220); // Accent Darga
 381         exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
 382         exceptions.addMapping(0x05B0, 220); // Point Sheva
 383         exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
 384         exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
 385         exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
 386         exceptions.addMapping(0x05B4, 220); // Point Hiriq
 387         exceptions.addMapping(0x05B5, 220); // Point Tsere
 388         exceptions.addMapping(0x05B6, 220); // Point Segol
 389         exceptions.addMapping(0x05B7, 220); // Point Patah
 390         exceptions.addMapping(0x05B8, 220); // Point Qamats
 391         exceptions.addMapping(0x05BB, 220); // Point Qubuts
 392         exceptions.addMapping(0x05BD, 220); // Point Meteg
 393         exceptions.addMapping(0x059A, 222); // Accent Yetiv
 394         exceptions.addMapping(0x05AD, 222); // Accent Dehi
 395         exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
 396         exceptions.addMapping(0x0593, 230); // Accent Shalshelet
 397         exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
 398         exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
 399         exceptions.addMapping(0x0597, 230); // Accent Revia
 400         exceptions.addMapping(0x0598, 230); // Accent Zarqa
 401         exceptions.addMapping(0x059F, 230); // Accent Qarney Para
 402         exceptions.addMapping(0x059E, 230); // Accent Gershayim
 403         exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
 404         exceptions.addMapping(0x059C, 230); // Accent Geresh
 405         exceptions.addMapping(0x0592, 230); // Accent Segolta
 406         exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
 407         exceptions.addMapping(0x05AC, 230); // Accent Iluy
 408         exceptions.addMapping(0x05A8, 230); // Accent Qadma
 409         exceptions.addMapping(0x05AB, 230); // Accent Ole
 410         exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
 411         exceptions.addMapping(0x05A1, 230); // Accent Pazer
 412       //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
 413         exceptions.addMapping(0x05AE, 232); // Accent Zinor
 414         exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
 415         exceptions.addMapping(0x0599, 232); // Accent Pashta
 416
 417         exceptions.addMapping(0x0655,  27); // ARABIC HAMZA BELOW
 418         exceptions.addMapping(0x0654,  27); // ARABIC HAMZA ABOVE
 419
 420         exceptions.addMapping(0x0651,  28); // ARABIC SHADDA
 421
 422         exceptions.addMapping(0x0656,  29); // ARABIC SUBSCRIPT ALEF
 423         exceptions.addMapping(0x0670,  29); // ARABIC LETTER SUPERSCRIPT ALEF
 424
 425         exceptions.addMapping(0x064D,  30); // ARABIC KASRATAN
 426         exceptions.addMapping(0x0650,  30); // ARABIC KASRA
 427
 428         exceptions.addMapping(0x0652,  31); // ARABIC SUKUN
 429         exceptions.addMapping(0x06E1,  31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
 430
 431         exceptions.addMapping(0x064B,  31); // ARABIC FATHATAN
 432         exceptions.addMapping(0x064C,  31); // ARABIC DAMMATAN
 433         exceptions.addMapping(0x064E,  31); // ARABIC FATHA
 434         exceptions.addMapping(0x064F,  31); // ARABIC DAMMA
 435         exceptions.addMapping(0x0657,  31); // ARABIC INVERTED DAMMA
 436         exceptions.addMapping(0x0658,  31); // ARABIC MARK NOON GHUNNA
 437
 438         exceptions.addMapping(0x0653,  32); // ARABIC MADDAH ABOVE
 439
 440         exceptions.snapshot();
 441
 442         for (int i = 0; i < markCount; i += 1) {
 443             int mark = markSet.charAt(i);
 444             int markClass = exceptions.getGlyphClassID(mark);
 445
 446             if (markClass == 0) {
 447                 markClass = UCharacter.getCombiningClass(mark);
 448             }
 449
 450             combiningClasses.addMapping(mark, markClass);
 451         }
 452
 453         combiningClasses.snapshot();
 454         return combiningClasses;
 455     }
 456
 457     public static void buildDecompTables(String fileName)
 458     {
 459         // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
 460       //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
 461         UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
 462         CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
 463         ClassTable classTable = new ClassTable();
 464
 465         LookupList  lookupList  = new LookupList();
 466         FeatureList featureList = new FeatureList();
 467         ScriptList  scriptList  = new ScriptList();
 468
 469         // build common, inherited lookups...
 470 //        int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
 471 //        int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
 472
 473         for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
 474
 475             // This is a bit lame, but it's the only way I can think of
 476             // to make this work w/o knowing the values of COMMON and INHERITED...
 477             if (script == UScript.COMMON || script == UScript.INHERITED ||
 478                 data.getMaxEquivalents(script) == 0) {
 479                 continue;
 480             }
 481
 482             int[] lookups = buildLookups(data, lookupList, script);
 483
 484             Feature ccmp = new Feature("ccmp");
 485
 486             addLookups(ccmp, lookups);
 487 //            addLookups(ccmp, commonLookups);
 488 //            addLookups(ccmp, inheritedLookups);
 489
 490             featureList.addFeature(ccmp);
 491
 492             String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
 493
 494             scriptList.addFeature(scriptTag, "(default)", ccmp);
 495
 496             if (script == UScript.ARABIC) {
 497                 buildArabicTables(scriptList, featureList, lookupList, classTable);
 498             }
 499         }
 500
 501         featureList.finalizeFeatureList();
 502
 503         ClassTable markClassTable = buildCombiningClassTable();
 504
 505         GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
 506         GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
 507         String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
 508
 509         LigatureModuleWriter writer = new LigatureModuleWriter();
 510
 511         writer.openFile(fileName);
 512         writer.writeHeader(null, includeFiles);
 513         writer.writeTable(gsubWriter);
 514         writer.writeTable(gdefWriter);
 515         writer.writeTrailer();
 516         writer.closeFile();
 517     }
 518
 519     public static void main(String[] args)
 520     {
 521         buildDecompTables(args[0]);
 522     }
 523 }