2 // Copyright (C) 2002-2009, International Business Machines Corporation and others.
\r
3 // All Rights Reserved.
\r
7 package com.ibm.icu.text;
\r
9 import java.io.DataOutputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.OutputStream;
\r
12 import java.util.ArrayList;
\r
13 import java.util.HashMap;
\r
14 import java.util.List;
\r
15 import java.util.Map;
\r
16 import java.util.Set;
\r
18 import com.ibm.icu.impl.Assert;
\r
19 import com.ibm.icu.impl.ICUDebug;
\r
21 class RBBIRuleBuilder {
\r
22 // This is the main class for building (compiling) break rules into the tables
\r
23 // required by the runtime RBBI engine.
\r
26 String fDebugEnv; // controls debug trace output
\r
27 String fRules; // The rule string that we are compiling
\r
28 RBBIRuleScanner fScanner; // The scanner.
\r
32 // There are four separate parse trees generated, one for each of the
\r
33 // forward rules, reverse rules, safe forward rules and safe reverse rules.
\r
34 // This array references the root of each of the trees.
\r
36 RBBINode[] fTreeRoots = new RBBINode[4];
\r
37 static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
\r
38 static final int fReverseTree = 1; // for each of the trees.
\r
39 static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
\r
40 static final int fSafeRevTree = 3; // there is no array.)
\r
41 int fDefaultTree = fForwardTree; // For rules not qualified with a !
\r
42 // the tree to which they belong to.
\r
44 boolean fChainRules; // True for chained Unicode TR style rules.
\r
45 // False for traditional regexp rules.
\r
47 boolean fLBCMNoChain; // True: suppress chaining of rules on
\r
48 // chars with LineBreak property == CM.
\r
50 boolean fLookAheadHardBreak; // True: Look ahead matches cause an
\r
51 // immediate break, no continuing for the
\r
54 RBBISetBuilder fSetBuilder; // Set and Character Category builder.
\r
55 List<RBBINode> fUSetNodes; // Vector of all uset nodes.
\r
56 RBBITableBuilder fForwardTables; // State transition tables
\r
57 RBBITableBuilder fReverseTables;
\r
58 RBBITableBuilder fSafeFwdTables;
\r
59 RBBITableBuilder fSafeRevTables;
\r
62 // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
\r
64 Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
\r
65 // Map Key is the set of values.
\r
66 // Map Value is the runtime array index.
\r
68 List<Integer> fRuleStatusVals; // List of Integer objects. Has same layout as the
\r
69 // runtime array of status (tag) values -
\r
70 // number of values in group 1
\r
71 // first status value in group 1
\r
72 // 2nd status value in group 1
\r
74 // number of values in group 2
\r
75 // first status value in group 2
\r
78 // Error codes from ICU4C.
\r
79 // using these simplified the porting, and consolidated the
\r
80 // creation of Java exceptions
\r
82 static final int U_BRK_ERROR_START = 0x10200;
\r
83 /**< Start of codes indicating Break Iterator failures */
\r
85 static final int U_BRK_INTERNAL_ERROR = 0x10201;
\r
86 /**< An internal error (bug) was detected. */
\r
88 static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
\r
89 /**< Hex digits expected as part of a escaped char in a rule. */
\r
91 static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
\r
92 /**< Missing ';' at the end of a RBBI rule. */
\r
94 static final int U_BRK_RULE_SYNTAX = 0x10204;
\r
95 /**< Syntax error in RBBI rule. */
\r
97 static final int U_BRK_UNCLOSED_SET = 0x10205;
\r
98 /**< UnicodeSet witing an RBBI rule missing a closing ']'. */
\r
100 static final int U_BRK_ASSIGN_ERROR = 0x10206;
\r
101 /**< Syntax error in RBBI rule assignment statement. */
\r
103 static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
\r
104 /**< RBBI rule $Variable redefined. */
\r
106 static final int U_BRK_MISMATCHED_PAREN = 0x10208;
\r
107 /**< Mis-matched parentheses in an RBBI rule. */
\r
109 static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
\r
110 /**< Missing closing quote in an RBBI rule. */
\r
112 static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
\r
113 /**< Use of an undefined $Variable in an RBBI rule. */
\r
115 static final int U_BRK_INIT_ERROR = 0x1020b;
\r
116 /**< Initialization failure. Probable missing ICU Data. */
\r
118 static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
\r
119 /**< Rule contains an empty Unicode Set. */
\r
121 static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
\r
122 /**< !!option in RBBI rules not recognized. */
\r
124 static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
\r
125 /**< The {nnn} tag on a rule is mal formed */
\r
126 static final int U_BRK_MALFORMED_SET = 0x1020f;
\r
128 static final int U_BRK_ERROR_LIMIT = 0x10210;
\r
129 /**< This must always be the last value to indicate the limit for Break Iterator failures */
\r
132 //----------------------------------------------------------------------------------------
\r
136 //----------------------------------------------------------------------------------------
\r
/**
 * Creates a builder for the given rule source.
 *
 * <p>Reads the "rbbi" debug setting, stores the rule string, and creates the
 * empty uset-node and status-value lists together with the scanner and the
 * set builder that operate on this builder.
 *
 * @param rules the break rule source text to be compiled
 */
RBBIRuleBuilder(String rules) {
    fDebugEnv = ICUDebug.enabled("rbbi") ? ICUDebug.value("rbbi") : null;
    // Keep the rule source: flattenData() strips and serializes fRules, so it
    // must not be left null. Assigned before the scanner is created so the
    // scanner can see it through its back reference to this builder.
    fRules = rules;
    fUSetNodes = new ArrayList<RBBINode>();
    fRuleStatusVals = new ArrayList<Integer>();
    fScanner = new RBBIRuleScanner(this);
    fSetBuilder = new RBBISetBuilder(this);
}
\r
148 //----------------------------------------------------------------------------------------
\r
150 // flattenData() - Collect up the compiled RBBI rule data and put it into
\r
151 // the format for saving in ICU data files,
\r
// See the ICU4C file common/rbbidata.h for a detailed description.
\r
155 //----------------------------------------------------------------------------------------
\r
156 static final int align8(int i)
\r
158 return (i + 7) & 0xfffffff8;
\r
161 void flattenData(OutputStream os) throws IOException {
\r
162 DataOutputStream dos = new DataOutputStream(os);
\r
165 // Remove comments and whitespace from the rules to make it smaller.
\r
166 String strippedRules = RBBIRuleScanner.stripRules(fRules);
\r
168 // Calculate the size of each section in the data in bytes.
\r
169 // Sizes here are padded up to a multiple of 8 for better memory alignment.
\r
170 // Sections sizes actually stored in the header are for the actual data
\r
171 // without the padding.
\r
173 int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader));
\r
174 int forwardTableSize = align8(fForwardTables.getTableSize());
\r
175 int reverseTableSize = align8(fReverseTables.getTableSize());
\r
176 int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
\r
177 int safeRevTableSize = align8(fSafeRevTables.getTableSize());
\r
178 int trieSize = align8(fSetBuilder.getTrieSize());
\r
179 int statusTableSize = align8(fRuleStatusVals.size() * 4);
\r
180 int rulesSize = align8((strippedRules.length()) * 2);
\r
181 int totalSize = headerSize + forwardTableSize + reverseTableSize
\r
182 + safeFwdTableSize + safeRevTableSize
\r
183 + statusTableSize + trieSize + rulesSize;
\r
184 int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
\r
187 // Write out an ICU Data Header
\r
188 // TODO: actually create a real header, rather than just a placeholder.
\r
189 // The empty placeholder is ok for compile-and-go from within ICU4J.
\r
190 // Replicating the ICU4C genbrk tool for building .brk resources would need a real header.
\r
192 byte[] ICUDataHeader = new byte[0x80];
\r
193 dos.write(ICUDataHeader);
\r
196 // Write out the RBBIDataHeader
\r
198 int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader
\r
199 header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
\r
200 header[RBBIDataWrapper.DH_FORMATVERSION] = 0x03010000; // uint8_t fFormatVersion[4];
\r
201 header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
\r
202 header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
\r
203 header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
\r
204 header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
\r
205 header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
\r
206 header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
\r
207 header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE]
\r
208 + reverseTableSize; // fSTable
\r
209 header[RBBIDataWrapper.DH_SFTABLELEN] = safeFwdTableSize; // fSTableLen
\r
210 header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE]
\r
211 + safeFwdTableSize; // fSRTable
\r
212 header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize; // fSRTableLen
\r
213 header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE]
\r
214 + safeRevTableSize; // fTrie
\r
215 header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
\r
216 header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
\r
217 + header[RBBIDataWrapper.DH_TRIELEN];
\r
218 header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
\r
219 header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE]
\r
221 header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2;
\r
222 for (i = 0; i < header.length; i++) {
\r
223 dos.writeInt(header[i]);
\r
227 // Write out the actual state tables.
\r
229 tableData = fForwardTables.exportTable();
\r
230 Assert.assrt(outputPos == header[4]);
\r
231 for (i = 0; i < tableData.length; i++) {
\r
232 dos.writeShort(tableData[i]);
\r
236 tableData = fReverseTables.exportTable();
\r
237 Assert.assrt(outputPos == header[6]);
\r
238 for (i = 0; i < tableData.length; i++) {
\r
239 dos.writeShort(tableData[i]);
\r
243 Assert.assrt(outputPos == header[8]);
\r
244 tableData = fSafeFwdTables.exportTable();
\r
245 for (i = 0; i < tableData.length; i++) {
\r
246 dos.writeShort(tableData[i]);
\r
250 Assert.assrt(outputPos == header[10]);
\r
251 tableData = fSafeRevTables.exportTable();
\r
252 for (i = 0; i < tableData.length; i++) {
\r
253 dos.writeShort(tableData[i]);
\r
257 // write out the Trie table
\r
258 Assert.assrt(outputPos == header[12]);
\r
259 fSetBuilder.serializeTrie(os);
\r
260 outputPos += header[13];
\r
261 while (outputPos % 8 != 0) { // pad to an 8 byte boundary
\r
266 // Write out the status {tag} table.
\r
267 Assert.assrt(outputPos == header[16]);
\r
268 for (Integer val : fRuleStatusVals) {
\r
269 dos.writeInt(val.intValue());
\r
273 while (outputPos % 8 != 0) { // pad to an 8 byte boundary
\r
278 // Write out the stripped rules (rules with extra spaces removed
\r
279 // These go last in the data area, even though they are not last in the header.
\r
280 Assert.assrt(outputPos == header[14]);
\r
281 dos.writeChars(strippedRules);
\r
282 outputPos += strippedRules.length() * 2;
\r
283 while (outputPos % 8 != 0) { // pad to an 8 byte boundary
\r
289 //----------------------------------------------------------------------------------------
\r
291 // compileRules compile source rules, placing the compiled form into a output stream
\r
292 // The compiled form is identical to that from ICU4C (Big Endian).
\r
294 //----------------------------------------------------------------------------------------
\r
295 static void compileRules(String rules, OutputStream os) throws IOException
\r
298 // Read the input rules, generate a parse tree, symbol table,
\r
299 // and list of all Unicode Sets referenced by the rules.
\r
301 RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
\r
302 builder.fScanner.parse();
\r
305 // UnicodeSet processing.
\r
306 // Munge the Unicode Sets to create a set of character categories.
\r
307 // Generate the mapping tables (TRIE) from input 32-bit characters to
\r
308 // the character categories.
\r
310 builder.fSetBuilder.build();
\r
313 // Generate the DFA state transition table.
\r
315 builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
\r
316 builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
\r
317 builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
\r
318 builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
\r
319 builder.fForwardTables.build();
\r
320 builder.fReverseTables.build();
\r
321 builder.fSafeFwdTables.build();
\r
322 builder.fSafeRevTables.build();
\r
323 if (builder.fDebugEnv != null
\r
324 && builder.fDebugEnv.indexOf("states") >= 0) {
\r
325 builder.fForwardTables.printRuleStatusTable();
\r
329 // Package up the compiled data, writing it to an output stream
\r
330 // in the serialization format. This is the same as the ICU4C runtime format.
\r
332 builder.flattenData(os);
\r