1 package com.hughes.android.dictionary;
\r
3 import java.io.FileNotFoundException;
\r
4 import java.io.IOException;
\r
5 import java.io.RandomAccessFile;
\r
6 import java.io.Serializable;
\r
7 import java.util.ArrayList;
\r
8 import java.util.Collections;
\r
9 import java.util.LinkedHashMap;
\r
10 import java.util.List;
\r
11 import java.util.Map;
\r
12 import java.util.TreeMap;
\r
13 import java.util.regex.Pattern;
\r
15 import com.hughes.util.FileUtil;
\r
17 public class IndexBuilder {
\r
19 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
20 static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");
\r
22 public static void main(String[] args) throws IOException,
\r
23 ClassNotFoundException {
\r
24 if (args.length != 1) {
\r
25 System.err.println("No input file.");
\r
28 final String dictionaryFileName = args[0];
\r
29 createIndex(dictionaryFileName, Entry.LANG1);
\r
30 createIndex(dictionaryFileName, Entry.LANG2);
\r
33 private static void createIndex(final String dictionaryFileName,
\r
34 final byte lang) throws IOException, FileNotFoundException,
\r
35 ClassNotFoundException {
\r
37 rootBuilder = processDictionaryLines(dictionaryFileName, lang);
\r
38 FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang));
\r
39 rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang));
\r
41 rootBuilder.forEachNode(new Function<Node>() {
\r
43 public void invoke(final Node node) {
\r
44 for (final List<EntryDescriptor> entryDescriptors : node.entryDescriptorsMap.values()) {
\r
45 Collections.sort(entryDescriptors);
\r
49 // Dump twice to get accurate file locations.
\r
50 for (int i = 0; i < 2; ++i) {
\r
51 final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw");
\r
52 rootBuilder.dump(raf);
\r
57 // ----------------------------------------------------------------
\r
59 static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {
\r
61 final int numTokens;
\r
62 public EntryDescriptor(int offset, int numTokens) {
\r
63 this.offset = offset;
\r
64 this.numTokens = numTokens;
\r
67 public boolean equals(Object obj) {
\r
68 final EntryDescriptor that = (EntryDescriptor) obj;
\r
69 return this.offset == that.offset;
\r
72 public int hashCode() {
\r
76 public int compareTo(EntryDescriptor o) {
\r
77 return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;
\r
81 static final class Node implements Serializable {
\r
82 final String normalizedToken;
\r
84 final TreeMap<String, Node> children = new TreeMap<String, Node>();
\r
85 final TreeMap<String,List<EntryDescriptor>> entryDescriptorsMap = new TreeMap<String, List<EntryDescriptor>>();
\r
87 // final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();
\r
89 int indexFileLocation = -1;
\r
91 private int descendantTokenCount;
\r
92 private int descendantEntryCount = 0;
\r
94 public Node(final String normalizedToken) {
\r
95 if (normalizedToken.length() == 0) {
\r
96 System.out.println("Created root.");
\r
98 this.normalizedToken = normalizedToken.intern();
\r
101 public Node getNode(final String nToken, final int pos,
\r
102 final boolean create) {
\r
103 assert this.normalizedToken.equals(nToken.substring(0, pos));
\r
105 if (pos == nToken.length()) {
\r
106 assert normalizedToken.equals(nToken);
\r
110 final String rest = nToken.substring(pos);
\r
111 assert rest.length() > 0;
\r
113 final Map.Entry<String, Node> lcsEntry;
\r
116 final Map.Entry<String, Node> floorEntry = children.floorEntry(rest);
\r
117 final Map.Entry<String, Node> ceilingEntry = children
\r
118 .ceilingEntry(rest);
\r
119 final String floorLcs = floorEntry == null ? "" : StringUtil
\r
120 .longestCommonSubstring(rest, floorEntry.getKey());
\r
121 final String ceilingLcs = ceilingEntry == null ? "" : StringUtil
\r
122 .longestCommonSubstring(rest, ceilingEntry.getKey());
\r
123 if (floorLcs.length() > ceilingLcs.length()) {
\r
124 lcsEntry = floorEntry;
\r
127 lcsEntry = ceilingEntry;
\r
132 // No LCS, have to add everything.
\r
133 if (lcs.length() == 0) {
\r
137 final Node result = new Node(nToken);
\r
138 final Object old = children.put(rest.intern(), result);
\r
139 assert old == null;
\r
140 // System.out.println(" Adding final chunk: " + rest);
\r
144 assert lcsEntry != null;
\r
146 // The map already contained the LCS.
\r
147 if (lcs.length() == lcsEntry.getKey().length()) {
\r
148 assert lcs.equals(lcsEntry.getKey());
\r
149 final Node result = lcsEntry.getValue().getNode(nToken,
\r
150 pos + lcs.length(), create);
\r
151 assert result.normalizedToken.equals(nToken);
\r
159 // Have to split, inserting the LCS.
\r
160 // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +
\r
162 final Node newChild = new Node(nToken.substring(0, pos + lcs.length()));
\r
163 final Object old = children.put(lcs.intern(), newChild);
\r
164 assert old == null;
\r
165 children.remove(lcsEntry.getKey());
\r
166 newChild.children.put(lcsEntry.getKey().substring(lcs.length())
\r
167 .intern(), lcsEntry.getValue());
\r
169 if (lcs.equals(rest)) {
\r
172 final Node result = new Node(nToken);
\r
173 final Object old2 = newChild.children.put(rest.substring(lcs.length())
\r
174 .intern(), result);
\r
175 assert old2 == null;
\r
176 // System.out.println(" newchildren=" + newChild.children);
\r
181 void forEachNode(final Function<Node> f) {
\r
183 for (final Node child : children.values()) {
\r
184 child.forEachNode(f);
\r
188 int descendantCount() {
\r
190 for (final Node child : children.values()) {
\r
191 count += child.descendantCount();
\r
196 void recursiveSetDescendantOffsetCount() {
\r
197 descendantEntryCount = 0;
\r
198 descendantTokenCount = 0;
\r
199 for (final List<EntryDescriptor> entryDescriptors : entryDescriptorsMap.values()) {
\r
200 descendantTokenCount += 1;
\r
201 descendantEntryCount += entryDescriptors.size();
\r
203 for (final Node child : children.values()) {
\r
204 child.recursiveSetDescendantOffsetCount();
\r
205 descendantTokenCount += child.descendantTokenCount;
\r
206 descendantEntryCount += child.descendantEntryCount;
\r
211 public String toString() {
\r
212 return normalizedToken;
\r
215 void dump(final RandomAccessFile file) throws IOException {
\r
216 if (indexFileLocation == -1) {
\r
217 indexFileLocation = (int) file.getFilePointer();
\r
219 assert indexFileLocation == file.getFilePointer();
\r
223 file.writeInt(children.size());
\r
224 for (final Map.Entry<String, Node> child : children.entrySet()) {
\r
225 file.writeUTF(child.getKey());
\r
226 file.writeInt(child.getValue().indexFileLocation);
\r
230 file.writeInt(entryDescriptorsMap.size());
\r
231 for (final Map.Entry<String, List<EntryDescriptor>> entry : entryDescriptorsMap.entrySet()) {
\r
232 file.writeUTF(entry.getKey());
\r
233 file.writeInt(entry.getValue().size());
\r
234 for (int i = 0; i < entry.getValue().size(); ++i) {
\r
235 file.writeInt(entry.getValue().get(i).offset);
\r
240 for (final Map.Entry<String, Node> child : children.entrySet()) {
\r
241 child.getValue().dump(file);
\r
245 public void addToken(final String token, final EntryDescriptor entryDescriptor) {
\r
246 List<EntryDescriptor> entryDescriptors = this.entryDescriptorsMap.get(token);
\r
247 if (entryDescriptors == null) {
\r
248 entryDescriptors = new ArrayList<EntryDescriptor>();
\r
249 this.entryDescriptorsMap.put(token, entryDescriptors);
\r
251 entryDescriptors.add(entryDescriptor);
\r
255 // ----------------------------------------------------------------
\r
257 static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException {
\r
258 final Node root = new Node("");
\r
259 final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r");
\r
261 final Entry entry = new Entry();
\r
263 long fileLocation = 0;
\r
264 while ((line = dictionaryFile.readLine()) != null) {
\r
265 assert ((int) fileLocation) == fileLocation;
\r
267 line = line.trim();
\r
268 if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {
\r
271 final String text = entry.getIndexableText(Entry.LANG1);
\r
272 final String[] tokens = WHITESPACE.split(text);
\r
273 final Map<String,String> tokenToNormalizedMap = new LinkedHashMap<String,String>();
\r
274 for (String token : tokens) {
\r
275 if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {
\r
278 tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token, lang));
\r
280 for (final Map.Entry<String, String> tokenToNormalized : tokenToNormalizedMap.entrySet()) {
\r
281 final String normalizedToken = tokenToNormalized.getValue();
\r
282 final Node node = root.getNode(normalizedToken, 0, true);
\r
283 node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length));
\r
284 assert node == root.getNode(normalizedToken, 0, false);
\r
285 assert normalizedToken
\r
286 .equals(root.getNode(normalizedToken, 0, false).normalizedToken);
\r
289 if (lineCount % 10000 == 0) {
\r
290 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
294 fileLocation = dictionaryFile.getFilePointer();
\r
296 dictionaryFile.close();
\r