2 *******************************************************************************
\r
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.io.BufferedInputStream;
\r
11 import java.io.InputStream;
\r
12 import java.io.DataInputStream;
\r
13 import java.io.IOException;
\r
15 import com.ibm.icu.impl.Trie;
\r
16 import com.ibm.icu.impl.CharTrie;
\r
19 * <p>Internal class used for Rule Based Break Iterators</p>
\r
20 * <p>This class provides access to the compiled break rule data, as
\r
21 * it is stored in a .brk file.</p>
\r
25 final class RBBIDataWrapper {
\r
27 // These fields are the ready-to-use compiled rule data, as
\r
28 // read from the file.
\r
30 RBBIDataHeader fHeader;
\r
40 // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
\r
41 // Used by the rule compiler when flattening the data.
\r
43 final static int DH_SIZE = 24;
\r
44 final static int DH_MAGIC = 0;
\r
45 final static int DH_FORMATVERSION = 1;
\r
46 final static int DH_LENGTH = 2;
\r
47 final static int DH_CATCOUNT = 3;
\r
48 final static int DH_FTABLE = 4;
\r
49 final static int DH_FTABLELEN = 5;
\r
50 final static int DH_RTABLE = 6;
\r
51 final static int DH_RTABLELEN = 7;
\r
52 final static int DH_SFTABLE = 8;
\r
53 final static int DH_SFTABLELEN = 9;
\r
54 final static int DH_SRTABLE = 10;
\r
55 final static int DH_SRTABLELEN = 11;
\r
56 final static int DH_TRIE = 12;
\r
57 final static int DH_TRIELEN = 13;
\r
58 final static int DH_RULESOURCE = 14;
\r
59 final static int DH_RULESOURCELEN = 15;
\r
60 final static int DH_STATUSTABLE = 16;
\r
61 final static int DH_STATUSTABLELEN = 17;
\r
64 // Index offsets to the fields in a state table row.
\r
65 // Corresponds to struct RBBIStateTableRow in the C version.
\r
67 final static int ACCEPTING = 0;
\r
68 final static int LOOKAHEAD = 1;
\r
69 final static int TAGIDX = 2;
\r
70 final static int RESERVED = 3;
\r
71 final static int NEXTSTATES = 4;
\r
73 // Index offsets to header fields of a state table
\r
74 // struct RBBIStateTable {... in the C version.
\r
76 final static int NUMSTATES = 0;
\r
77 final static int ROWLEN = 2;
\r
78 final static int FLAGS = 4;
\r
79 final static int RESERVED_2 = 6;
\r
80 final static int ROW_DATA = 8;
\r
82 // Bit selectors for the "FLAGS" field of the state table header
\r
83 // enum RBBIStateTableFlags in the C version.
\r
85 final static int RBBI_LOOKAHEAD_HARD_BREAK = 1;
\r
86 final static int RBBI_BOF_REQUIRED = 2;
\r
89 * Data Header. A struct-like class with the fields from the RBBI data file header.
\r
91 final static class RBBIDataHeader {
\r
92 int fMagic; // == 0xbla0
\r
93 int fVersion; // == 1 (for ICU 3.2 and earlier.
\r
94 byte[] fFormatVersion; // For ICU 3.4 and later.
\r
95 int fLength; // Total length in bytes of this RBBI Data,
\r
96 // including all sections, not just the header.
\r
97 int fCatCount; // Number of character categories.
\r
100 // Offsets and sizes of each of the subsections within the RBBI data.
\r
101 // All offsets are bytes from the start of the RBBIDataHeader.
\r
102 // All sizes are in bytes.
\r
104 int fFTable; // forward state transition table.
\r
106 int fRTable; // Offset to the reverse state transition table.
\r
108 int fSFTable; // safe point forward transition table
\r
110 int fSRTable; // safe point reverse transition table
\r
112 int fTrie; // Offset to Trie data for character categories
\r
114 int fRuleSource; // Offset to the source for for the break
\r
115 int fRuleSourceLen; // rules. Stored UChar *.
\r
116 int fStatusTable; // Offset to the table of rule status values
\r
117 int fStatusTableLen;
\r
119 public RBBIDataHeader() {
\r
121 fFormatVersion = new byte[4];
\r
127 * RBBI State Table Indexing Function. Given a state number, return the
\r
128 * array index of the start of the state table row for that state.
\r
131 int getRowIndex(int state){
\r
132 return ROW_DATA + state * (fHeader.fCatCount + 4);
\r
135 static class TrieFoldingFunc implements Trie.DataManipulate {
\r
136 public int getFoldingOffset(int data) {
\r
137 if ((data & 0x8000) != 0) {
\r
138 return data & 0x7fff;
\r
144 static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
\r
147 RBBIDataWrapper() {
\r
151 * Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
\r
154 static RBBIDataWrapper get(InputStream is) throws IOException {
\r
157 DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
\r
158 RBBIDataWrapper This = new RBBIDataWrapper();
\r
160 // Seek past the ICU data header.
\r
161 // TODO: verify that the header looks good.
\r
164 // Read in the RBBI data header...
\r
165 This.fHeader = new RBBIDataHeader();
\r
166 This.fHeader.fMagic = dis.readInt();
\r
167 This.fHeader.fVersion = dis.readInt();
\r
168 This.fHeader.fFormatVersion[0] = (byte) (This.fHeader.fVersion >> 24);
\r
169 This.fHeader.fFormatVersion[1] = (byte) (This.fHeader.fVersion >> 16);
\r
170 This.fHeader.fFormatVersion[2] = (byte) (This.fHeader.fVersion >> 8);
\r
171 This.fHeader.fFormatVersion[3] = (byte) (This.fHeader.fVersion);
\r
172 This.fHeader.fLength = dis.readInt();
\r
173 This.fHeader.fCatCount = dis.readInt();
\r
174 This.fHeader.fFTable = dis.readInt();
\r
175 This.fHeader.fFTableLen = dis.readInt();
\r
176 This.fHeader.fRTable = dis.readInt();
\r
177 This.fHeader.fRTableLen = dis.readInt();
\r
178 This.fHeader.fSFTable = dis.readInt();
\r
179 This.fHeader.fSFTableLen = dis.readInt();
\r
180 This.fHeader.fSRTable = dis.readInt();
\r
181 This.fHeader.fSRTableLen = dis.readInt();
\r
182 This.fHeader.fTrie = dis.readInt();
\r
183 This.fHeader.fTrieLen = dis.readInt();
\r
184 This.fHeader.fRuleSource = dis.readInt();
\r
185 This.fHeader.fRuleSourceLen = dis.readInt();
\r
186 This.fHeader.fStatusTable = dis.readInt();
\r
187 This.fHeader.fStatusTableLen = dis.readInt();
\r
188 dis.skip(6 * 4); // uint32_t fReserved[6];
\r
191 if (This.fHeader.fMagic != 0xb1a0 ||
\r
192 ! (This.fHeader.fVersion == 1 || // ICU 3.2 and earlier
\r
193 This.fHeader.fFormatVersion[0] == 3) // ICU 3.4
\r
195 throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
\r
198 // Current position in input stream.
\r
199 int pos = 24 * 4; // offset of end of header, which has 24 fields, all int32_t (4 bytes)
\r
202 // Read in the Forward state transition table as an array of shorts.
\r
205 // Quick Sanity Check
\r
206 if (This.fHeader.fFTable < pos || This.fHeader.fFTable > This.fHeader.fLength) {
\r
207 throw new IOException("Break iterator Rule data corrupt");
\r
210 // Skip over any padding preceding this table
\r
211 dis.skip(This.fHeader.fFTable - pos);
\r
212 pos = This.fHeader.fFTable;
\r
214 This.fFTable = new short[This.fHeader.fFTableLen / 2];
\r
215 for ( i=0; i<This.fFTable.length; i++) {
\r
216 This.fFTable[i] = dis.readShort();
\r
221 // Read in the Reverse state table
\r
224 // Skip over any padding in the file
\r
225 dis.skip(This.fHeader.fRTable - pos);
\r
226 pos = This.fHeader.fRTable;
\r
228 // Create & fill the table itself.
\r
229 This.fRTable = new short[This.fHeader.fRTableLen / 2];
\r
230 for (i=0; i<This.fRTable.length; i++) {
\r
231 This.fRTable[i] = dis.readShort();
\r
236 // Read in the Safe Forward state table
\r
238 if (This.fHeader.fSFTableLen > 0) {
\r
239 // Skip over any padding in the file
\r
240 dis.skip(This.fHeader.fSFTable - pos);
\r
241 pos = This.fHeader.fSFTable;
\r
243 // Create & fill the table itself.
\r
244 This.fSFTable = new short[This.fHeader.fSFTableLen / 2];
\r
245 for (i=0; i<This.fSFTable.length; i++) {
\r
246 This.fSFTable[i] = dis.readShort();
\r
252 // Read in the Safe Reverse state table
\r
254 if (This.fHeader.fSRTableLen > 0) {
\r
255 // Skip over any padding in the file
\r
256 dis.skip(This.fHeader.fSRTable - pos);
\r
257 pos = This.fHeader.fSRTable;
\r
259 // Create & fill the table itself.
\r
260 This.fSRTable = new short[This.fHeader.fSRTableLen / 2];
\r
261 for (i=0; i<This.fSRTable.length; i++) {
\r
262 This.fSRTable[i] = dis.readShort();
\r
268 // Unserialize the Character categories TRIE
\r
269 // Because we can't be absolutely certain where the Trie deserialize will
\r
270 // leave the input stream, leave position unchanged.
\r
271 // The seek to the start of the next item following the TRIE will get us
\r
274 dis.skip(This.fHeader.fTrie - pos); // seek input stream from end of previous section to
\r
275 pos = This.fHeader.fTrie; // the start of the trie
\r
277 dis.mark(This.fHeader.fTrieLen+100); // Mark position of start of TRIE in the input
\r
278 // and tell Java to keep the mark valid so long
\r
279 // as we don't go more than 100 bytes
\r
280 // past the end of the TRIE.
\r
282 This.fTrie = new CharTrie(dis, fTrieFoldingFunc); // Deserialize the TRIE, leaving input
\r
283 // stream at an unknown position, preceding the
\r
284 // padding between TRIE and following section.
\r
286 dis.reset(); // Move input stream back to marked position at
\r
287 // the start of the serialized TRIE. Now our
\r
288 // "pos" variable and the input stream are in
\r
292 // Read the Rule Status Table
\r
294 if (pos > This.fHeader.fStatusTable) {
\r
295 throw new IOException("Break iterator Rule data corrupt");
\r
297 dis.skip(This.fHeader.fStatusTable - pos);
\r
298 pos = This.fHeader.fStatusTable;
\r
299 This.fStatusTable = new int[This.fHeader.fStatusTableLen / 4];
\r
300 for (i=0; i<This.fStatusTable.length; i++) {
\r
301 This.fStatusTable[i] = dis.readInt();
\r
306 // Put the break rule source into a String
\r
308 if (pos > This.fHeader.fRuleSource) {
\r
309 throw new IOException("Break iterator Rule data corrupt");
\r
311 dis.skip(This.fHeader.fRuleSource - pos);
\r
312 pos = This.fHeader.fRuleSource;
\r
313 StringBuffer sb = new StringBuffer(This.fHeader.fRuleSourceLen / 2);
\r
314 for (i=0; i<This.fHeader.fRuleSourceLen; i+=2) {
\r
315 sb.append(dis.readChar());
\r
318 This.fRuleSource = sb.toString();
\r
320 if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
\r
327 // Getters for fields from the state table header
\r
329 final static int getNumStates(short table[]) {
\r
330 int hi = table[NUMSTATES];
\r
331 int lo = table[NUMSTATES+1];
\r
332 int val = (hi<<16) + (lo&0x0000ffff);
\r
338 /** Debug function to display the break iterator data.
\r
342 System.out.println("RBBI Data Wrapper dump ...");
\r
343 System.out.println();
\r
344 System.out.println("Forward State Table");
\r
345 dumpTable(fFTable);
\r
346 System.out.println("Reverse State Table");
\r
347 dumpTable(fRTable);
\r
348 System.out.println("Forward Safe Points Table");
\r
349 dumpTable(fSFTable);
\r
350 System.out.println("Reverse Safe Points Table");
\r
351 dumpTable(fSRTable);
\r
353 dumpCharCategories();
\r
354 System.out.println("Source Rules: " + fRuleSource);
\r
360 /** Fixed width int-to-string conversion.
\r
364 static public String intToString(int n, int width) {
\r
365 StringBuffer dest = new StringBuffer(width);
\r
367 while (dest.length() < width) {
\r
368 dest.insert(0, ' ');
\r
370 return dest.toString();
\r
375 /** Fixed width int-to-string conversion.
\r
379 static public String intToHexString(int n, int width) {
\r
380 StringBuffer dest = new StringBuffer(width);
\r
381 dest.append(Integer.toHexString(n));
\r
382 while (dest.length() < width) {
\r
383 dest.insert(0, ' ');
\r
385 return dest.toString();
\r
390 /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */
\r
391 private void dumpTable(short table[]) {
\r
392 if (table == null) {
\r
393 System.out.println(" -- null -- ");
\r
397 String header = " Row Acc Look Tag";
\r
398 for (n=0; n<fHeader.fCatCount; n++) {
\r
399 header += intToString(n, 5);
\r
401 System.out.println(header);
\r
402 for (n=0; n<header.length(); n++) {
\r
403 System.out.print("-");
\r
405 System.out.println();
\r
406 for (state=0; state< getNumStates(table); state++) {
\r
407 dumpRow(table, state);
\r
409 System.out.println();
\r
416 * Dump (for debug) a single row of an RBBI state table
\r
421 private void dumpRow(short table[], int state) {
\r
422 StringBuffer dest = new StringBuffer(fHeader.fCatCount*5 + 20);
\r
423 dest.append(intToString(state, 4));
\r
424 int row = getRowIndex(state);
\r
425 if (table[row+ACCEPTING] != 0) {
\r
426 dest.append(intToString(table[row+ACCEPTING], 5));
\r
430 if (table[row+LOOKAHEAD] != 0) {
\r
431 dest.append(intToString(table[row+LOOKAHEAD], 5));
\r
435 dest.append(intToString(table[row+TAGIDX], 5));
\r
437 for (int col=0; col<fHeader.fCatCount; col++) {
\r
438 dest.append(intToString(table[row+NEXTSTATES+col], 5));
\r
441 System.out.println(dest);
\r
446 private void dumpCharCategories() {
\r
447 int n = fHeader.fCatCount;
\r
448 String catStrings[] = new String[n+1];
\r
449 int rangeStart = 0;
\r
454 int lastNewline[] = new int[n+1];
\r
456 for (category = 0; category <= fHeader.fCatCount; category ++) {
\r
457 catStrings[category] = "";
\r
459 System.out.println("\nCharacter Categories");
\r
460 System.out.println("--------------------");
\r
461 for (char32 = 0; char32<=0x10ffff; char32++) {
\r
462 category = fTrie.getCodePointValue(char32);
\r
463 category &= ~0x4000; // Mask off dictionary bit.
\r
464 if (category < 0 || category > fHeader.fCatCount) {
\r
465 System.out.println("Error, bad category " + Integer.toHexString(category) +
\r
466 " for char " + Integer.toHexString(char32));
\r
469 if (category == lastCat ) {
\r
470 rangeEnd = char32;
\r
472 if (lastCat >= 0) {
\r
473 if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
\r
474 lastNewline[lastCat] = catStrings[lastCat].length() + 10;
\r
475 catStrings[lastCat] += "\n ";
\r
478 catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
\r
479 if (rangeEnd != rangeStart) {
\r
480 catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
\r
483 lastCat = category;
\r
484 rangeStart = rangeEnd = char32;
\r
487 catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
\r
488 if (rangeEnd != rangeStart) {
\r
489 catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
\r
492 for (category = 0; category <= fHeader.fCatCount; category ++) {
\r
493 System.out.println (intToString(category, 5) + " " + catStrings[category]);
\r
495 System.out.println();
\r
499 /*static RBBIDataWrapper get(String name) throws IOException {
\r
500 String fullName = "data/" + name;
\r
501 InputStream is = ICUData.getRequiredStream(fullName);
\r
505 public static void main(String[] args) {
\r
507 if (args.length == 0) {
\r
512 System.out.println("RBBIDataWrapper.main(" + s + ") ");
\r
514 String versionedName = ICUResourceBundle.ICU_BUNDLE+"/"+ s + ".brk";
\r
517 RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
\r
520 catch (Exception e) {
\r
521 System.out.println("Exception: " + e.toString());
\r