2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.DataInputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.text.CharacterIterator;
\r
14 import com.ibm.icu.impl.ICUBinary;
\r
17 * This is a class used to load in the compact trie dictionary file
\r
18 * used for dictionary based break iteration.
\r
20 class BreakCTDictionary {
\r
21 private CompactTrieHeader fData;
\r
23 static class CompactTrieHeader {
\r
24 int size; // Size of data in bytes
\r
26 int magic; // Magic number (including versions)
\r
28 int nodeCount; // Number of entries in offsets[]
\r
30 int root; // Node number of the root node
\r
32 int []offset; // Offsets to nodes from start of data
\r
34 CompactTrieHeader() {
\r
43 static final class CompactTrieNodeFlags {
\r
44 static final int kVerticalNode = 0x1000; // This is a vertical node
\r
46 static final int kParentEndsWord = 0x2000; // The node whose equal link
\r
47 // points to this ends a
\r
50 static final int kReservedFlag1 = 0x4000;
\r
52 static final int kReservedFlag2 = 0x8000;
\r
54 static final int kCountMask = 0x0FFF; // The count portion of
\r
57 static final int kFlagMask = 0xF000; // The flags portion of
\r
61 // The two node types are distinguished by the kVerticalNode flag.
\r
62 static class CompactTrieHorizontalNode {
\r
65 int equal; // Equal link node index
\r
67 CompactTrieHorizontalNode(char newCh, int newEqual) {
\r
73 static class CompactTrieVerticalNode {
\r
74 int equal; // Equal link node index
\r
76 char chars[]; // Code units
\r
78 CompactTrieVerticalNode() {
\r
84 private CompactTrieNodes getCompactTrieNode(int node) {
\r
88 // private class to hold both node information
\r
89 static class CompactTrieNodes {
\r
90 short flagscount; // Count of sub-entries, plus flags
\r
92 CompactTrieHorizontalNode[] hnode;
\r
94 CompactTrieVerticalNode vnode;
\r
96 CompactTrieNodes() {
\r
103 private CompactTrieNodes[] nodes;
\r
106 public BreakCTDictionary(InputStream is) throws IOException {
\r
107 ICUBinary.readHeader(is, DATA_FORMAT_ID, null);
\r
109 DataInputStream in = new DataInputStream(is);
\r
110 // Get header information
\r
111 fData = new CompactTrieHeader();
\r
112 fData.size = in.readInt();
\r
113 fData.magic = in.readInt();
\r
114 fData.nodeCount = in.readShort();
\r
115 fData.root = in.readShort();
\r
117 loadBreakCTDictionary(in);
\r
120 // Loads the compact trie dictionary file into the CompactTrieNodes
\r
121 private void loadBreakCTDictionary(DataInputStream in) throws IOException {
\r
122 // skip over offset information
\r
123 for (int i = 0; i < fData.nodeCount; i++) {
\r
127 // Create compact trie dictionary
\r
128 nodes = new CompactTrieNodes[fData.nodeCount];
\r
129 nodes[0] = new CompactTrieNodes();
\r
131 // Load in compact trie dictionary
\r
132 for (int j = 1; j < fData.nodeCount; j++) {
\r
133 nodes[j] = new CompactTrieNodes();
\r
134 nodes[j].flagscount = in.readShort();
\r
136 int count = nodes[j].flagscount & CompactTrieNodeFlags.kCountMask;
\r
139 boolean isVerticalNode = (nodes[j].flagscount & CompactTrieNodeFlags.kVerticalNode) != 0;
\r
142 if (isVerticalNode) {
\r
143 nodes[j].vnode = new CompactTrieVerticalNode();
\r
144 nodes[j].vnode.equal = in.readShort();
\r
146 nodes[j].vnode.chars = new char[count];
\r
147 for (int l = 0; l < count; l++) {
\r
148 nodes[j].vnode.chars[l] = in.readChar();
\r
150 } else { // Horizontal node
\r
151 nodes[j].hnode = new CompactTrieHorizontalNode[count];
\r
152 for (int n = 0; n < count; n++) {
\r
153 nodes[j].hnode[n] = new CompactTrieHorizontalNode(in
\r
154 .readChar(), in.readShort());
\r
162 * Find dictionary words that match the text.
\r
164 * @param text A CharacterIterator representing the text. The iterator is
\r
165 * left after the longest prefix match in the dictionary.
\r
166 * @param maxLength The maximum number of code units to match.
\r
167 * @param lengths An array that is filled with the lengths of words that matched.
\r
168 * @param count Filled with the number of elements output in lengths.
\r
169 * @param limit The size of the lengths array; this limits the number of words output.
\r
170 * @return The number of characters in text that were matched.
\r
172 public int matches(CharacterIterator text, int maxLength, int lengths[],
\r
173 int count[], int limit) {
\r
174 // Current implementation works in UTF-16 space
\r
175 CompactTrieNodes node = getCompactTrieNode(fData.root);
\r
178 char uc = text.current();
\r
180 boolean exitFlag = false;
\r
182 while (node != null) {
\r
183 // Check if the node we just exited ends a word
\r
185 && (node.flagscount & CompactTrieNodeFlags.kParentEndsWord) != 0) {
\r
186 lengths[mycount++] = i;
\r
189 // Check that we haven't exceeded the maximum number of input
\r
191 // We have to do that here rather than in the while condition so
\r
193 // we can check for ending of a word above.
\r
194 if (i >= maxLength) {
\r
198 int nodeCount = (node.flagscount & CompactTrieNodeFlags.kCountMask);
\r
199 if (nodeCount == 0) {
\r
200 // Special terminal node; return now
\r
203 if ((node.flagscount & CompactTrieNodeFlags.kVerticalNode) != 0) {
\r
204 // Vertical node; check all the characters in it
\r
205 CompactTrieVerticalNode vnode = node.vnode;
\r
206 for (int j = 0; j < nodeCount && i < maxLength; j++) {
\r
207 if (uc != vnode.chars[j]) {
\r
208 // We hit a non equal character return;
\r
213 uc = text.current();
\r
219 // To get here, we must have come through the whole list successfully;
\r
220 // go on to the next node. Note that a word cannot end in the middle
\r
221 // of a vertical node.
\r
222 node = getCompactTrieNode(vnode.equal);
\r
224 // Horizontal node; do binary search
\r
225 CompactTrieHorizontalNode[] hnode = node.hnode;
\r
227 int high = nodeCount - 1;
\r
229 node = null; // If we don't find a match, we'll fall out of the loop
\r
230 while (high >= low) {
\r
231 middle = (high + low) / 2;
\r
232 if (uc == hnode[middle].ch) {
\r
233 // We hit a match; get the next node and next character
\r
234 node = getCompactTrieNode(hnode[middle].equal);
\r
236 uc = text.current();
\r
239 } else if (uc < hnode[middle].ch) {
\r
248 count[0] = mycount;
\r
252 // Use for reading the header portion of the file
\r
253 private static final byte DATA_FORMAT_ID[] = { (byte) 0x54, (byte) 0x72,
\r
254 (byte) 0x44, (byte) 0x63 };
\r