/** ******************************************************************************* * Copyright (C) 2002-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.tool.layout; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; /** * @author Eric Mader * * Notes: * * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical * decomposition. * * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]" * will match all Latin, Greek and Cyrillic characters with a canonical decomposition. * * Are these three scripts enough? Do we want to collect them all at once and distribute by script, * or process them one script at a time. It's probably a good idea to build a single table for * however many scripts there are. * * It might be better to collect all the characters that have a canonical decomposition and just * sort them into however many scripts there are... unless we'll get characters in COMMON??? */ public class CanonGSUBBuilder { static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable) { int leftType = ArabicShaping.VALUE_NONE; int rightType = ArabicShaping.VALUE_NONE; switch (type) { case UCharacter.DecompositionType.ISOLATED: break; case UCharacter.DecompositionType.FINAL: rightType = ArabicShaping.VALUE_LEFT; break; case UCharacter.DecompositionType.INITIAL: leftType = ArabicShaping.VALUE_RIGHT; break; case UCharacter.DecompositionType.MEDIAL: rightType = ArabicShaping.VALUE_LEFT; leftType = ArabicShaping.VALUE_RIGHT; break; default: return decomp + UCharacter.toString(ligature); } char[] chars = decomp.toCharArray(); ArabicShaping.shape(chars, leftType, rightType, isolClassTable); return new String(chars) + UCharacter.toString(ligature); } static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable, ClassTable finaClassTable, ClassTable isolClassTable) { System.out.print("Finding Arabic contextual forms... "); for (int i = 0; i < data.countRecords(); i += 1) { ArabicCharacterData.Record record = data.getRecord(i); String decomposition = record.getDecomposition(); if (decomposition != null && decomposition.length() == 1) { int contextual = record.getCodePoint(); int isolated = UTF16.charAt(record.getDecomposition(), 0); switch (record.getDecompositionType()) { case UCharacter.DecompositionType.INITIAL: initClassTable.addMapping(isolated, contextual); break; case UCharacter.DecompositionType.MEDIAL: mediClassTable.addMapping(isolated, contextual); break; case UCharacter.DecompositionType.FINAL: finaClassTable.addMapping(isolated, contextual); break; case UCharacter.DecompositionType.ISOLATED: isolClassTable.addMapping(isolated, contextual); break; default: // issue some error message? break; } } } System.out.println("Done."); } static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable) { LigatureTree contextualTree = new LigatureTree(); int ligatureCount = 0; System.out.print("Building Arabic ligature tree... "); for (int i = 0; i < data.countRecords(); i += 1) { ArabicCharacterData.Record record = data.getRecord(i); String decomposition = record.getDecomposition(); if (decomposition != null && decomposition.length() > 1) { int ligature = record.getCodePoint(); int decompType = record.getDecompositionType(); switch (decompType) { case UCharacter.DecompositionType.FINAL: case UCharacter.DecompositionType.INITIAL: case UCharacter.DecompositionType.MEDIAL: case UCharacter.DecompositionType.ISOLATED: contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable)); ligatureCount += 1; break; case UCharacter.DecompositionType.CANONICAL: //cannonicalTree.insert(decomposition + UCharacter.toString(ligature)); break; } } } System.out.println(ligatureCount + " ligatures."); return contextualTree; } static final int SIMPLE_GLYPH = 1; static final int LIGATURE_GLYPH = 2; static final int MARK_GLYPH = 3; static final int COMPONENT_GLYPH = 4; static final int categoryClassMap[] = { 0, // UNASSIGNED SIMPLE_GLYPH, // UPPERCASE_LETTER SIMPLE_GLYPH, // LOWERCASE_LETTER SIMPLE_GLYPH, // TITLECASE_LETTER SIMPLE_GLYPH, // MODIFIER_LETTER SIMPLE_GLYPH, // OTHER_LETTER MARK_GLYPH, // NON_SPACING_MARK MARK_GLYPH, // ENCLOSING_MARK ?? MARK_GLYPH, // COMBINING_SPACING_MARK ?? SIMPLE_GLYPH, // DECIMAL_NUMBER SIMPLE_GLYPH, // LETTER_NUMBER SIMPLE_GLYPH, // OTHER_NUMBER; 0, // SPACE_SEPARATOR 0, // LINE_SEPARATOR 0, // PARAGRAPH_SEPARATOR 0, // CONTROL 0, // FORMAT 0, // PRIVATE_USE 0, // SURROGATE SIMPLE_GLYPH, // DASH_PUNCTUATION SIMPLE_GLYPH, // START_PUNCTUATION SIMPLE_GLYPH, // END_PUNCTUATION SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION SIMPLE_GLYPH, // OTHER_PUNCTUATION SIMPLE_GLYPH, // MATH_SYMBOL; SIMPLE_GLYPH, // CURRENCY_SYMBOL SIMPLE_GLYPH, // MODIFIER_SYMBOL SIMPLE_GLYPH, // OTHER_SYMBOL SIMPLE_GLYPH, // INITIAL_PUNCTUATION SIMPLE_GLYPH // FINAL_PUNCTUATION }; static int getGlyphClass(ArabicCharacterData.Record record) { String decomp = record.getDecomposition(); if (decomp != null && decomp.length() > 1) { return LIGATURE_GLYPH; } return categoryClassMap[record.getGeneralCategory()]; } static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable) { System.out.print("Adding Arabic glyph classes... "); for (int i = 0; i < data.countRecords(); i += 1) { ArabicCharacterData.Record record = data.getRecord(i); classTable.addMapping(record.getCodePoint(), getGlyphClass(record)); } System.out.println("Done."); } private static void buildArabicTables(ScriptList scriptList, FeatureList featureList, LookupList lookupList, ClassTable classTable) { // TODO: Might want to have the ligature table builder explicitly check for ligatures // which start with space and tatweel rather than pulling them out here... UnicodeSet arabicBlock = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]"); UnicodeSet oddLigatures = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]"); UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]"); ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures)); addArabicGlyphClasses(arabicData, classTable); ClassTable initClassTable = new ClassTable(); ClassTable mediClassTable = new ClassTable(); ClassTable finaClassTable = new ClassTable(); ClassTable isolClassTable = new ClassTable(); buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable); isolClassTable.snapshot(); LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable); LigatureTreeWalker ligaWalker = new LigatureTreeWalker(); ligaTree.walk(ligaWalker); Lookup initLookup, mediLookup, finaLookup, ligaLookup; initLookup = new Lookup(Lookup.GSST_Single, 0); initLookup.addSubtable(initClassTable); mediLookup = new Lookup(Lookup.GSST_Single, 0); mediLookup.addSubtable(mediClassTable); finaLookup = new Lookup(Lookup.GSST_Single, 0); finaLookup.addSubtable(finaClassTable); ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks); ligaLookup.addSubtable(ligaWalker); Feature init = new Feature("init"); Feature medi = new Feature("medi"); Feature fina = new Feature("fina"); Feature liga = new Feature("liga"); init.addLookup(lookupList.addLookup(initLookup)); medi.addLookup(lookupList.addLookup(mediLookup)); fina.addLookup(lookupList.addLookup(finaLookup)); liga.addLookup(lookupList.addLookup(ligaLookup)); featureList.addFeature(init); featureList.addFeature(medi); featureList.addFeature(fina); featureList.addFeature(liga); scriptList.addFeature("arab", "(default)", init); scriptList.addFeature("arab", "(default)", medi); scriptList.addFeature("arab", "(default)", fina); scriptList.addFeature("arab", "(default)", liga); System.out.println(); } public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree) { int ligatureCount = 0; System.out.print("building composition ligature tree for " + UScript.getName(script) + "... "); for (int i = 0; i < data.countRecords(script); i += 1) { CanonicalCharacterData.Record record = data.getRecord(script, i); String composed = UCharacter.toString(record.getComposedCharacter()); for (int e = 0; e < record.countEquivalents(); e += 1) { String equivalent = record.getEquivalent(e); ligatureTree.insert(equivalent + composed); ligatureCount += 1; } } System.out.println(ligatureCount + " ligatures."); } public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script) { int maxDecompCount = data.getMaxEquivalents(script); DecompTable[] decompTables = new DecompTable[maxDecompCount]; System.out.print("Building decompositon tables for " + UScript.getName(script) + "... total decompositions: " + data.countRecords(script) + ", max: " + maxDecompCount + "..."); for (int i = 0; i < maxDecompCount; i += 1) { DecompTable table = new DecompTable(); for (int r = 0; r < data.countRecords(script); r += 1) { CanonicalCharacterData.Record record = data.getRecord(script, r); if (record.countEquivalents() > i) { table.add(record.getComposedCharacter(), record.getEquivalent(i)); } } decompTables[i] = table; } System.out.println(" Done."); return decompTables; } public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script) { int[] lookups = new int[2]; DecompTable[] decompTables = buildDecompTables(data, script); LigatureTree compTree = new LigatureTree(); buildLigatureTree(data, script, compTree); System.out.println(); LigatureTreeWalker compWalker = new LigatureTreeWalker(); compTree.walk(compWalker); Lookup compLookup, dcmpLookup; //int compLookupIndex, dcmpLookupIndex; compLookup = new Lookup(Lookup.GSST_Ligature, 0); compLookup.addSubtable(compWalker); dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0); for (int i = 0; i < decompTables.length; i += 1) { dcmpLookup.addSubtable(decompTables[i]); } lookups[0] = lookupList.addLookup(compLookup); lookups[1] = lookupList.addLookup(dcmpLookup); return lookups; } public static void addLookups(Feature feature, int[] lookups) { for (int i = 0; i < lookups.length; i += 1) { feature.addLookup(lookups[i]); } } /* * Hebrew mark order taken from the SBL Hebrew Font manual * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks */ public static ClassTable buildCombiningClassTable() { UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]"); ClassTable exceptions = new ClassTable(); ClassTable combiningClasses = new ClassTable(); int markCount = markSet.size(); exceptions.addMapping(0x05C1, 10); // Point Shin Dot exceptions.addMapping(0x05C2, 11); // Point Sin Dot exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq exceptions.addMapping(0x05BF, 23); // Point Rafe exceptions.addMapping(0x05B9, 27); // Point Holam exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum) exceptions.addMapping(0x0591, 220); // Accent Etnahta exceptions.addMapping(0x0596, 220); // Accent Tipeha exceptions.addMapping(0x059B, 220); // Accent Tevir exceptions.addMapping(0x05A3, 220); // Accent Munah exceptions.addMapping(0x05A4, 220); // Accent Mahapakh exceptions.addMapping(0x05A5, 220); // Accent Merkha exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula exceptions.addMapping(0x05A7, 220); // Accent Darga exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo exceptions.addMapping(0x05B0, 220); // Point Sheva exceptions.addMapping(0x05B1, 220); // Point Hataf Segol exceptions.addMapping(0x05B2, 220); // Point Hataf Patah exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats exceptions.addMapping(0x05B4, 220); // Point Hiriq exceptions.addMapping(0x05B5, 220); // Point Tsere exceptions.addMapping(0x05B6, 220); // Point Segol exceptions.addMapping(0x05B7, 220); // Point Patah exceptions.addMapping(0x05B8, 220); // Point Qamats exceptions.addMapping(0x05BB, 220); // Point Qubuts exceptions.addMapping(0x05BD, 220); // Point Meteg exceptions.addMapping(0x059A, 222); // Accent Yetiv exceptions.addMapping(0x05AD, 222); // Accent Dehi exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum) exceptions.addMapping(0x0593, 230); // Accent Shalshelet exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol exceptions.addMapping(0x0597, 230); // Accent Revia exceptions.addMapping(0x0598, 230); // Accent Zarqa exceptions.addMapping(0x059F, 230); // Accent Qarney Para exceptions.addMapping(0x059E, 230); // Accent Gershayim exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam exceptions.addMapping(0x059C, 230); // Accent Geresh exceptions.addMapping(0x0592, 230); // Accent Segolta exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola exceptions.addMapping(0x05AC, 230); // Accent Iluy exceptions.addMapping(0x05A8, 230); // Accent Qadma exceptions.addMapping(0x05AB, 230); // Accent Ole exceptions.addMapping(0x05AF, 230); // Mark Masora Circle exceptions.addMapping(0x05A1, 230); // Accent Pazer //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot exceptions.addMapping(0x05AE, 232); // Accent Zinor exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana exceptions.addMapping(0x0599, 232); // Accent Pashta exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE exceptions.addMapping(0x0651, 28); // ARABIC SHADDA exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN exceptions.addMapping(0x0650, 30); // ARABIC KASRA exceptions.addMapping(0x0652, 31); // ARABIC SUKUN exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN exceptions.addMapping(0x064E, 31); // ARABIC FATHA exceptions.addMapping(0x064F, 31); // ARABIC DAMMA exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE exceptions.snapshot(); for (int i = 0; i < markCount; i += 1) { int mark = markSet.charAt(i); int markClass = exceptions.getGlyphClassID(mark); if (markClass == 0) { markClass = UCharacter.getCombiningClass(mark); } combiningClasses.addMapping(mark, markClass); } combiningClasses.snapshot(); return combiningClasses; } public static void buildDecompTables(String fileName) { // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored. //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]"); UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]"); CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet); ClassTable classTable = new ClassTable(); LookupList lookupList = new LookupList(); FeatureList featureList = new FeatureList(); ScriptList scriptList = new ScriptList(); // build common, inherited lookups... // int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON); // int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED); for (int script = 0; script < UScript.CODE_LIMIT; script += 1) { // This is a bit lame, but it's the only way I can think of // to make this work w/o knowing the values of COMMON and INHERITED... if (script == UScript.COMMON || script == UScript.INHERITED || data.getMaxEquivalents(script) == 0) { continue; } int[] lookups = buildLookups(data, lookupList, script); Feature ccmp = new Feature("ccmp"); addLookups(ccmp, lookups); // addLookups(ccmp, commonLookups); // addLookups(ccmp, inheritedLookups); featureList.addFeature(ccmp); String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script)); scriptList.addFeature(scriptTag, "(default)", ccmp); if (script == UScript.ARABIC) { buildArabicTables(scriptList, featureList, lookupList, classTable); } } featureList.finalizeFeatureList(); ClassTable markClassTable = buildCombiningClassTable(); GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList); GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable); String[] includeFiles = {"LETypes.h", "CanonShaping.h"}; LigatureModuleWriter writer = new LigatureModuleWriter(); writer.openFile(fileName); writer.writeHeader(null, includeFiles); writer.writeTable(gsubWriter); writer.writeTable(gdefWriter); writer.writeTrailer(); writer.closeFile(); } public static void main(String[] args) { buildDecompTables(args[0]); } }