2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import com.ibm.icu.impl.ICUBinary;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.io.DataInputStream;
\r
14 import java.text.CharacterIterator;
\r
17 * This is a class used to load in the compact trie dictionary file
\r
18 * used for dictionary based break iteration.
\r
20 * @deprecated This API is ICU internal only.
\r
22 class BreakCTDictionary {
\r
23 private CompactTrieHeader fData;
\r
25 static class CompactTrieHeader {
\r
26 int size; // Size of data in bytes
\r
28 int magic; // Magic number (including versions)
\r
30 int nodeCount; // Number of entries in offsets[]
\r
32 int root; // Node number of the root node
\r
34 int []offset; // Offsets to nodes from start of data
\r
36 CompactTrieHeader() {
\r
45 static final class CompactTrieNodeFlags {
\r
46 static final int kVerticalNode = 0x1000; // This is a vertical node
\r
48 static final int kParentEndsWord = 0x2000; // The node whose equal link
\r
49 // points to this ends a
\r
52 static final int kReservedFlag1 = 0x4000;
\r
54 static final int kReservedFlag2 = 0x8000;
\r
56 static final int kCountMask = 0x0FFF; // The count portion of
\r
59 static final int kFlagMask = 0xF000; // The flags portion of
\r
63 // The two node types are distinguished by the kVerticalNode flag.
\r
64 static class CompactTrieHorizontalNode {
\r
67 int equal; // Equal link node index
\r
69 CompactTrieHorizontalNode(char newCh, int newEqual) {
\r
75 static class CompactTrieVerticalNode {
\r
76 int equal; // Equal link node index
\r
78 char chars[]; // Code units
\r
80 CompactTrieVerticalNode() {
\r
86 private CompactTrieNodes getCompactTrieNode(int node) {
\r
90 // private class to hold both node information
\r
91 static class CompactTrieNodes {
\r
92 short flagscount; // Count of sub-entries, plus flags
\r
94 CompactTrieHorizontalNode[] hnode;
\r
96 CompactTrieVerticalNode vnode;
\r
98 CompactTrieNodes() {
\r
105 private CompactTrieNodes[] nodes;
\r
108 public BreakCTDictionary(InputStream is) throws IOException {
\r
109 ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
\r
111 DataInputStream in = new DataInputStream(is);
\r
112 // Get header information
\r
113 fData = new CompactTrieHeader();
\r
114 fData.size = in.readInt();
\r
115 fData.magic = in.readInt();
\r
116 fData.nodeCount = in.readShort();
\r
117 fData.root = in.readShort();
\r
119 loadBreakCTDictionary(in);
\r
122 // Loads the compact trie dictionary file into the CompactTrieNodes
\r
123 private void loadBreakCTDictionary(DataInputStream in) throws IOException {
\r
124 // skip over offset information
\r
125 for (int i = 0; i < fData.nodeCount; i++) {
\r
129 // Create compact trie dictionary
\r
130 nodes = new CompactTrieNodes[fData.nodeCount];
\r
131 nodes[0] = new CompactTrieNodes();
\r
133 // Load in compact trie dictionary
\r
134 for (int j = 1; j < fData.nodeCount; j++) {
\r
135 nodes[j] = new CompactTrieNodes();
\r
136 nodes[j].flagscount = in.readShort();
\r
138 int count = nodes[j].flagscount & CompactTrieNodeFlags.kCountMask;
\r
141 boolean isVerticalNode = (nodes[j].flagscount & CompactTrieNodeFlags.kVerticalNode) != 0;
\r
144 if (isVerticalNode) {
\r
145 nodes[j].vnode = new CompactTrieVerticalNode();
\r
146 nodes[j].vnode.equal = in.readShort();
\r
148 nodes[j].vnode.chars = new char[count];
\r
149 for (int l = 0; l < count; l++) {
\r
150 nodes[j].vnode.chars[l] = in.readChar();
\r
152 } else { // Horizontal node
\r
153 nodes[j].hnode = new CompactTrieHorizontalNode[count];
\r
154 for (int n = 0; n < count; n++) {
\r
155 nodes[j].hnode[n] = new CompactTrieHorizontalNode(in
\r
156 .readChar(), in.readShort());
\r
164 * Find dictionary words that match the text.
\r
167 * A CharacterIterator representing the text. The iterator is
\r
168 * left after the longest prefix match in the dictionary.
\r
170 * The maximum number of code units to match.
\r
172 * An array that is filled with the lengths of words that
\r
175 * Filled with the number of elements output in lengths.
\r
177 * The size of the lengths array; this limits the number of words
\r
179 * @return The number of characters in text that were matched.
\r
181 public int matches(CharacterIterator text, int maxLength, int lengths[],
\r
182 int count[], int limit) {
\r
183 // Current implementation works in UTF-16 space
\r
184 CompactTrieNodes node = getCompactTrieNode(fData.root);
\r
187 char uc = (char) text.current();
\r
189 boolean exitFlag = false;
\r
191 while (node != null) {
\r
192 // Check if the node we just exited ends a word
\r
194 && (node.flagscount & CompactTrieNodeFlags.kParentEndsWord) != 0) {
\r
195 lengths[mycount++] = i;
\r
198 // Check that we haven't exceeded the maximum number of input
\r
200 // We have to do that here rather than in the while condition so
\r
202 // we can check for ending of a word above.
\r
203 if (i >= maxLength) {
\r
207 int nodeCount = (node.flagscount & CompactTrieNodeFlags.kCountMask);
\r
208 if (nodeCount == 0) {
\r
209 // Special terminal node; return now
\r
212 if ((node.flagscount & CompactTrieNodeFlags.kVerticalNode) != 0) {
\r
213 // Vertical node; check all the characters in it
\r
214 CompactTrieVerticalNode vnode = node.vnode;
\r
215 for (int j = 0; j < nodeCount && i < maxLength; j++) {
\r
216 if (uc != vnode.chars[j]) {
\r
217 // We hit a non equal character return;
\r
222 uc = (char) text.current();
\r
228 // To get here, we must have come through the whole list successfully;
\r
229 // go on to the next node. Note that a word cannot end in the middle
\r
230 // of a vertical node.
\r
231 node = getCompactTrieNode(vnode.equal);
\r
233 // Horizontal node; do binary search
\r
234 CompactTrieHorizontalNode[] hnode = node.hnode;
\r
236 int high = nodeCount - 1;
\r
238 node = null; // If we don't find a match, we'll fall out of the loop
\r
239 while (high >= low) {
\r
240 middle = (high + low) / 2;
\r
241 if (uc == hnode[middle].ch) {
\r
242 // We hit a match; get the next node and next character
\r
243 node = getCompactTrieNode(hnode[middle].equal);
\r
245 uc = (char) text.current();
\r
248 } else if (uc < hnode[middle].ch) {
\r
257 count[0] = mycount;
\r
261 // Use for reading the header portion of the file
\r
262 private static final byte DATA_FORMAT_ID[] = { (byte) 0x54, (byte) 0x72,
\r
263 (byte) 0x44, (byte) 0x63 };
\r