2 *******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 package com.ibm.icu.text;
10 import java.io.BufferedInputStream;
11 import java.io.DataInputStream;
12 import java.io.IOException;
13 import java.io.InputStream;
15 import com.ibm.icu.impl.CharTrie;
16 import com.ibm.icu.impl.Trie;
19 * <p>Internal class used for Rule Based Break Iterators</p>
20 * <p>This class provides access to the compiled break rule data, as
21 * it is stored in a .brk file.
23 final class RBBIDataWrapper {
25 // These fields are the ready-to-use compiled rule data, as
26 // read from the file.
28 RBBIDataHeader fHeader;
38 // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
39 // Used by the rule compiler when flattening the data.
// Each index counts 32-bit header fields, in the same order in which
// get() reads them from the stream. DH_SIZE is the total number of
// 32-bit fields in the header, including the six reserved ones.
41 final static int DH_SIZE = 24;
42 final static int DH_MAGIC = 0;
43 final static int DH_FORMATVERSION = 1;
44 final static int DH_LENGTH = 2;
45 final static int DH_CATCOUNT = 3;
46 final static int DH_FTABLE = 4;
47 final static int DH_FTABLELEN = 5;
48 final static int DH_RTABLE = 6;
49 final static int DH_RTABLELEN = 7;
50 final static int DH_SFTABLE = 8;
51 final static int DH_SFTABLELEN = 9;
52 final static int DH_SRTABLE = 10;
53 final static int DH_SRTABLELEN = 11;
54 final static int DH_TRIE = 12;
55 final static int DH_TRIELEN = 13;
56 final static int DH_RULESOURCE = 14;
57 final static int DH_RULESOURCELEN = 15;
58 final static int DH_STATUSTABLE = 16;
59 final static int DH_STATUSTABLELEN = 17;
62 // Index offsets to the fields in a state table row.
63 // Corresponds to struct RBBIStateTableRow in the C version.
// A row begins with the four fixed cells ACCEPTING .. RESERVED; the
// next-state cells, one per character category, begin at NEXTSTATES.
65 final static int ACCEPTING = 0;
66 final static int LOOKAHEAD = 1;
67 final static int TAGIDX = 2;
68 final static int RESERVED = 3;
69 final static int NEXTSTATES = 4;
71 // Index offsets to header fields of a state table
72 // struct RBBIStateTable {... in the C version.
// The table is held as a short[]. getNumStates() reads NUMSTATES as two
// consecutive shorts; the offsets below advance by 2, so presumably every
// header field is stored that way (TODO confirm). Row data starts at
// index ROW_DATA.
74 final static int NUMSTATES = 0;
75 final static int ROWLEN = 2;
76 final static int FLAGS = 4;
77 final static int RESERVED_2 = 6;
78 final static int ROW_DATA = 8;
80 // Bit selectors for the "FLAGS" field of the state table header
81 // enum RBBIStateTableFlags in the C version.
83 final static int RBBI_LOOKAHEAD_HARD_BREAK = 1;
84 final static int RBBI_BOF_REQUIRED = 2;
87 * Data Header. A struct-like class with the fields from the RBBI data file header.
89 final static class RBBIDataHeader {
90 int fMagic; // == 0xb1a0
91 int fVersion; // == 1 (for ICU 3.2 and earlier).
92 byte[] fFormatVersion; // For ICU 3.4 and later: the four bytes of fVersion, most significant first.
93 int fLength; // Total length in bytes of this RBBI Data,
94 // including all sections, not just the header.
95 int fCatCount; // Number of character categories.
98 // Offsets and sizes of each of the subsections within the RBBI data.
99 // All offsets are bytes from the start of the RBBIDataHeader.
100 // All sizes are in bytes.
102 int fFTable; // Offset to the forward state transition table.
104 int fRTable; // Offset to the reverse state transition table.
106 int fSFTable; // Offset to the safe point forward transition table.
108 int fSRTable; // Offset to the safe point reverse transition table.
110 int fTrie; // Offset to Trie data for character categories
112 int fRuleSource; // Offset to the source for the break
113 int fRuleSourceLen; // rules. Stored UChar *.
114 int fStatusTable; // Offset to the table of rule status values
117 public RBBIDataHeader() {
// Four bytes: one per byte of the 32-bit version word.
119 fFormatVersion = new byte[4];
125 * RBBI State Table Indexing Function. Given a state number, return the
126 * array index of the start of the state table row for that state.
129 int getRowIndex(int state){
// Rows start at index ROW_DATA. Each row is four fixed cells (ACCEPTING,
// LOOKAHEAD, TAGIDX, RESERVED) followed by one next-state cell per
// character category, hence the row length of fCatCount + 4.
130 return ROW_DATA + state * (fHeader.fCatCount + 4);
133 static class TrieFoldingFunc implements Trie.DataManipulate {
134 public int getFoldingOffset(int data) {
// When bit 0x8000 of the trie data is set, the folding offset is the
// remaining low 15 bits.
135 if ((data & 0x8000) != 0) {
136 return data & 0x7fff;
// Shared instance passed to the CharTrie constructor in get().
142 static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
149 * Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
152 static RBBIDataWrapper get(InputStream is) throws IOException {
// The stream is read strictly front to back; "pos" (declared below) tracks
// the byte offset from the start of the RBBI data header so that each
// section can be reached by skipping the gap before it.
155 DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
156 RBBIDataWrapper This = new RBBIDataWrapper();
158 // Seek past the ICU data header.
159 // TODO: verify that the header looks good.
162 // Read in the RBBI data header...
163 This.fHeader = new RBBIDataHeader();
164 This.fHeader.fMagic = dis.readInt();
165 This.fHeader.fVersion = dis.readInt();
// Split the 32-bit version word into four bytes, most significant first.
166 This.fHeader.fFormatVersion[0] = (byte) (This.fHeader.fVersion >> 24);
167 This.fHeader.fFormatVersion[1] = (byte) (This.fHeader.fVersion >> 16);
168 This.fHeader.fFormatVersion[2] = (byte) (This.fHeader.fVersion >> 8);
169 This.fHeader.fFormatVersion[3] = (byte) (This.fHeader.fVersion);
170 This.fHeader.fLength = dis.readInt();
171 This.fHeader.fCatCount = dis.readInt();
172 This.fHeader.fFTable = dis.readInt();
173 This.fHeader.fFTableLen = dis.readInt();
174 This.fHeader.fRTable = dis.readInt();
175 This.fHeader.fRTableLen = dis.readInt();
176 This.fHeader.fSFTable = dis.readInt();
177 This.fHeader.fSFTableLen = dis.readInt();
178 This.fHeader.fSRTable = dis.readInt();
179 This.fHeader.fSRTableLen = dis.readInt();
180 This.fHeader.fTrie = dis.readInt();
181 This.fHeader.fTrieLen = dis.readInt();
182 This.fHeader.fRuleSource = dis.readInt();
183 This.fHeader.fRuleSourceLen = dis.readInt();
184 This.fHeader.fStatusTable = dis.readInt();
185 This.fHeader.fStatusTableLen = dis.readInt();
// NOTE(review): the value returned by skip() is not checked here or at the
// later skip() calls; InputStream.skip may skip fewer bytes than requested.
// Confirm that "pos" cannot drift from the real stream position.
186 dis.skip(6 * 4); // uint32_t fReserved[6];
// Accept only magic 0xb1a0 together with either a version word of 1 or a
// leading format-version byte of 3.
189 if (This.fHeader.fMagic != 0xb1a0 ||
190 ! (This.fHeader.fVersion == 1 || // ICU 3.2 and earlier
191 This.fHeader.fFormatVersion[0] == 3) // ICU 3.4
193 throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
196 // Current position in input stream.
197 int pos = 24 * 4; // offset of end of header, which has 24 fields, all int32_t (4 bytes)
200 // Read in the Forward state transition table as an array of shorts.
203 // Quick Sanity Check
204 if (This.fHeader.fFTable < pos || This.fHeader.fFTable > This.fHeader.fLength) {
205 throw new IOException("Break iterator Rule data corrupt");
208 // Skip over any padding preceding this table
209 dis.skip(This.fHeader.fFTable - pos);
210 pos = This.fHeader.fFTable;
// Table lengths in the header are in bytes; each table entry is a 2-byte short.
212 This.fFTable = new short[This.fHeader.fFTableLen / 2];
213 for ( i=0; i<This.fFTable.length; i++) {
214 This.fFTable[i] = dis.readShort();
219 // Read in the Reverse state table
222 // Skip over any padding in the file
223 dis.skip(This.fHeader.fRTable - pos);
224 pos = This.fHeader.fRTable;
226 // Create & fill the table itself.
227 This.fRTable = new short[This.fHeader.fRTableLen / 2];
228 for (i=0; i<This.fRTable.length; i++) {
229 This.fRTable[i] = dis.readShort();
234 // Read in the Safe Forward state table
// This table is read only when its length in the header is positive.
236 if (This.fHeader.fSFTableLen > 0) {
237 // Skip over any padding in the file
238 dis.skip(This.fHeader.fSFTable - pos);
239 pos = This.fHeader.fSFTable;
241 // Create & fill the table itself.
242 This.fSFTable = new short[This.fHeader.fSFTableLen / 2];
243 for (i=0; i<This.fSFTable.length; i++) {
244 This.fSFTable[i] = dis.readShort();
250 // Read in the Safe Reverse state table
// This table is read only when its length in the header is positive.
252 if (This.fHeader.fSRTableLen > 0) {
253 // Skip over any padding in the file
254 dis.skip(This.fHeader.fSRTable - pos);
255 pos = This.fHeader.fSRTable;
257 // Create & fill the table itself.
258 This.fSRTable = new short[This.fHeader.fSRTableLen / 2];
259 for (i=0; i<This.fSRTable.length; i++) {
260 This.fSRTable[i] = dis.readShort();
266 // Unserialize the Character categories TRIE
267 // Because we can't be absolutely certain where the Trie deserialize will
268 // leave the input stream, leave position unchanged.
269 // The seek to the start of the next item following the TRIE will get us
272 dis.skip(This.fHeader.fTrie - pos); // seek input stream from end of previous section to
273 pos = This.fHeader.fTrie; // the start of the trie
275 dis.mark(This.fHeader.fTrieLen+100); // Mark position of start of TRIE in the input
276 // and tell Java to keep the mark valid so long
277 // as we don't go more than 100 bytes past the
278 // end of the TRIE.
280 This.fTrie = new CharTrie(dis, fTrieFoldingFunc); // Deserialize the TRIE, leaving input
281 // stream at an unknown position, preceding the
282 // padding between TRIE and following section.
284 dis.reset(); // Move input stream back to marked position at
285 // the start of the serialized TRIE. Now our
286 // "pos" variable and the input stream are in
290 // Read the Rule Status Table
// A section that starts before the current position cannot be reached by
// skipping forward, so it is reported as corrupt data.
292 if (pos > This.fHeader.fStatusTable) {
293 throw new IOException("Break iterator Rule data corrupt");
295 dis.skip(This.fHeader.fStatusTable - pos);
296 pos = This.fHeader.fStatusTable;
// Status table entries are 4-byte ints; the header length is in bytes.
297 This.fStatusTable = new int[This.fHeader.fStatusTableLen / 4];
298 for (i=0; i<This.fStatusTable.length; i++) {
299 This.fStatusTable[i] = dis.readInt();
304 // Put the break rule source into a String
306 if (pos > This.fHeader.fRuleSource) {
307 throw new IOException("Break iterator Rule data corrupt");
309 dis.skip(This.fHeader.fRuleSource - pos);
310 pos = This.fHeader.fRuleSource;
// fRuleSourceLen is a byte count; each readChar() consumes two bytes,
// hence the step of 2.
311 StringBuilder sb = new StringBuilder(This.fHeader.fRuleSourceLen / 2);
312 for (i=0; i<This.fHeader.fRuleSourceLen; i+=2) {
313 sb.append(dis.readChar());
316 This.fRuleSource = sb.toString();
// Debug hook: active when the debug environment string contains "data".
318 if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
325 // Getters for fields from the state table header
327 final static int getNumStates(short table[]) {
// The state count is a 32-bit value split across two shorts: the high half
// at NUMSTATES and the low half at NUMSTATES+1. The low half is masked to
// 16 bits so that sign extension of a negative short cannot disturb the
// high half when the two are added.
328 int hi = table[NUMSTATES];
329 int lo = table[NUMSTATES+1];
330 int val = (hi<<16) + (lo&0x0000ffff);
336 /* Debug function to display the break iterator data. */
338 if (fFTable.length == 0) {
339 // There is no table. Fail early for testing purposes.
// NOTE(review): an empty (zero-length) forward table is signalled with
// NullPointerException even though fFTable itself is not null here.
340 throw new NullPointerException();
// All output goes to System.out: a banner, a heading per state table,
// then the character categories and the rule source text.
342 System.out.println("RBBI Data Wrapper dump ...");
343 System.out.println();
344 System.out.println("Forward State Table");
346 System.out.println("Reverse State Table");
348 System.out.println("Forward Safe Points Table");
350 System.out.println("Reverse Safe Points Table");
353 dumpCharCategories();
354 System.out.println("Source Rules: " + fRuleSource);
360 /* Fixed width int-to-string conversion. */
361 static public String intToString(int n, int width) {
362 StringBuilder dest = new StringBuilder(width);
// Loop until the buffer holds at least "width" characters.
364 while (dest.length() < width) {
367 return dest.toString();
372 /* Fixed width int-to-hex-string conversion. */
373 static public String intToHexString(int n, int width) {
374 StringBuilder dest = new StringBuilder(width);
// Integer.toHexString formats n as an unsigned value, so a negative n
// yields eight hex digits.
375 dest.append(Integer.toHexString(n));
// Loop until the buffer holds at least "width" characters.
376 while (dest.length() < width) {
379 return dest.toString();
384 /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
385 private void dumpTable(short table[]) {
387 System.out.println(" -- null -- ");
// Column headings: the fixed row fields, then one column per character
// category, each formatted five characters wide.
391 StringBuilder header = new StringBuilder(" Row Acc Look Tag");
392 for (n=0; n<fHeader.fCatCount; n++) {
393 header.append(intToString(n, 5));
395 System.out.println(header.toString());
// Underline the heading with one dash per heading character.
396 for (n=0; n<header.length(); n++) {
397 System.out.print("-");
399 System.out.println();
// One output line per state; the state count comes from the table header.
400 for (state=0; state< getNumStates(table); state++) {
401 dumpRow(table, state);
403 System.out.println();
410 * Dump (for debug) a single row of an RBBI state table
414 private void dumpRow(short table[], int state) {
// Capacity estimate: five characters per category column plus room for the
// fixed leading columns.
415 StringBuilder dest = new StringBuilder(fHeader.fCatCount*5 + 20);
416 dest.append(intToString(state, 4));
// Index in table[] of the first cell of this state's row.
417 int row = getRowIndex(state);
// The ACCEPTING and LOOKAHEAD values are printed when they are non-zero.
418 if (table[row+ACCEPTING] != 0) {
419 dest.append(intToString(table[row+ACCEPTING], 5));
423 if (table[row+LOOKAHEAD] != 0) {
424 dest.append(intToString(table[row+LOOKAHEAD], 5));
428 dest.append(intToString(table[row+TAGIDX], 5));
// Next-state cells: one per character category, starting at NEXTSTATES.
430 for (int col=0; col<fHeader.fCatCount; col++) {
431 dest.append(intToString(table[row+NEXTSTATES+col], 5));
434 System.out.println(dest);
439 private void dumpCharCategories() {
440 int n = fHeader.fCatCount;
// Both arrays have n+1 slots because category values from 0 through
// fCatCount inclusive are accepted below.
441 String catStrings[] = new String[n+1];
447 int lastNewline[] = new int[n+1];
449 for (category = 0; category <= fHeader.fCatCount; category ++) {
450 catStrings[category] = "";
452 System.out.println("\nCharacter Categories");
453 System.out.println("--------------------");
// Visit every code point from 0 through 0x10ffff and look up its category
// in the trie.
454 for (char32 = 0; char32<=0x10ffff; char32++) {
455 category = fTrie.getCodePointValue(char32);
456 category &= ~0x4000; // Mask off dictionary bit.
457 if (category < 0 || category > fHeader.fCatCount) {
458 System.out.println("Error, bad category " + Integer.toHexString(category) +
459 " for char " + Integer.toHexString(char32));
462 if (category == lastCat ) {
// Start a continuation line once the category's text has grown more than
// 70 characters past the recorded position of its last line break.
466 if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
467 lastNewline[lastCat] = catStrings[lastCat].length() + 10;
468 catStrings[lastCat] += "\n ";
// Append the range as "start" or "start-end", in hexadecimal.
471 catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
472 if (rangeEnd != rangeStart) {
473 catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
477 rangeStart = rangeEnd = char32;
// Append the range that was still pending when the scan ended.
480 catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
481 if (rangeEnd != rangeStart) {
482 catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
// Print one entry per category: its number followed by its ranges.
485 for (category = 0; category <= fHeader.fCatCount; category ++) {
486 System.out.println (intToString(category, 5) + " " + catStrings[category]);
488 System.out.println();
492 /*static RBBIDataWrapper get(String name) throws IOException {
493 String fullName = "data/" + name;
494 InputStream is = ICUData.getRequiredStream(fullName);
498 public static void main(String[] args) {
500 if (args.length == 0) {
505 System.out.println("RBBIDataWrapper.main(" + s + ") ");
507 String versionedName = ICUResourceBundle.ICU_BUNDLE+"/"+ s + ".brk";
510 RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
513 catch (Exception e) {
514 System.out.println("Exception: " + e.toString());