1 package com.hughes.android.dictionary;
\r
3 import java.io.FileNotFoundException;
\r
4 import java.io.IOException;
\r
5 import java.io.RandomAccessFile;
\r
6 import java.io.Serializable;
\r
7 import java.util.ArrayList;
\r
8 import java.util.Collections;
\r
9 import java.util.LinkedHashMap;
\r
10 import java.util.List;
\r
11 import java.util.Map;
\r
12 import java.util.TreeMap;
\r
13 import java.util.regex.Pattern;
\r
15 import com.hughes.util.FileUtil;
\r
17 public class IndexBuilder {
\r
/** Matches one or more whitespace characters; used to split indexable text into tokens. */
19 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
/** Matches one or more characters that are not ASCII letters (A-Z, a-z). */
20 static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");
\r
/**
 * Command-line entry point. Takes the dictionary file name as its single
 * argument and calls createIndex on it twice: once for Entry.LANG1 and once
 * for Entry.LANG2.
 *
 * @param args command-line arguments; args[0] is the dictionary file name
 * @throws IOException propagated from createIndex
 * @throws ClassNotFoundException propagated from createIndex
 */
22 public static void main(String[] args) throws IOException,
\r
23 ClassNotFoundException {
\r
// Any argument count other than exactly one is reported on stderr.
24 if (args.length != 1) {
\r
25 System.err.println("No input file.");
\r
28 final String dictionaryFileName = args[0];
\r
// Build one index per language from the same dictionary file.
29 createIndex(dictionaryFileName, Entry.LANG1);
\r
30 createIndex(dictionaryFileName, Entry.LANG2);
\r
/**
 * Builds the index for one language of the dictionary and writes it to disk.
 *
 * Steps: build the node tree with processDictionaryLines; write it to
 * "<dictionaryFileName>_builder_<lang>.serialized" with FileUtil.write and
 * read it straight back with FileUtil.read; sort every node's entry
 * descriptor lists; dump the tree to the file named by
 * Dictionary.INDEX_FORMAT.
 *
 * @param dictionaryFileName name of the dictionary file; also used as the
 *        prefix of the serialized-builder file and as an argument to
 *        Dictionary.INDEX_FORMAT
 * @param lang language selector (callers pass Entry.LANG1 or Entry.LANG2)
 * @throws IOException on failure reading the dictionary or writing the outputs
 * @throws FileNotFoundException if a file cannot be opened
 * @throws ClassNotFoundException propagated from FileUtil.read
 */
33 private static void createIndex(final String dictionaryFileName,
\r
34 final byte lang) throws IOException, FileNotFoundException,
\r
35 ClassNotFoundException {
\r
37 rootBuilder = processDictionaryLines(dictionaryFileName, lang);
\r
// NOTE(review): the tree is read back immediately after being written;
// presumably a serialization round-trip check or a reusable intermediate
// file -- confirm.
38 FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang));
\r
39 rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang));
\r
// Sort each descriptor list with EntryDescriptor.compareTo, i.e. ascending
// by numTokens. Collections.sort is stable, so descriptors with equal
// numTokens keep their insertion order.
41 rootBuilder.forEachNode(new Function<Node>() {
\r
43 public void invoke(final Node node) {
\r
44 for (final List<EntryDescriptor> entryDescriptors : node.entryDescriptorsMap.values()) {
\r
45 Collections.sort(entryDescriptors);
\r
49 // Dump twice to get accurate file locations.
\r
// Both passes open the same index file in "rw" mode, which does not
// truncate, so the second pass writes from the start over the first.
50 for (int i = 0; i < 2; ++i) {
\r
51 final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw");
\r
52 rootBuilder.dump(raf);
\r
57 // ----------------------------------------------------------------
\r
59 static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {
\r
61 final int numTokens;
\r
62 public EntryDescriptor(int offset, int numTokens) {
\r
63 this.offset = offset;
\r
64 this.numTokens = numTokens;
\r
67 public boolean equals(Object obj) {
\r
68 final EntryDescriptor that = (EntryDescriptor) obj;
\r
69 return this.offset == that.offset;
\r
72 public int hashCode() {
\r
76 public int compareTo(EntryDescriptor o) {
\r
77 return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;
\r
82 // ----------------------------------------------------------------
\r
/**
 * Reads the dictionary file line by line and builds a tree of nodes keyed by
 * normalized token. Every accepted token of every parsed entry is added to
 * its node together with an EntryDescriptor holding the file offset of the
 * entry's line and the number of tokens in the entry's text.
 *
 * @param dictionaryFileName name of the dictionary file, opened read-only
 * @param lang language selector supplied by createIndex
 * @return presumably the root node built here (the declared type is Node) --
 *         TODO confirm
 * @throws IOException if the dictionary file cannot be opened or read
 */
// NOTE(review): lang is not referenced in the code shown below; the indexable
// text is always requested for Entry.LANG1, so both indexes built by
// createIndex would cover the same language -- confirm whether lang was meant
// to be passed to getIndexableText.
84 static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException {
\r
85 final Node root = new Node("");
\r
86 final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r");
\r
// A single Entry instance is re-parsed for every line.
88 final Entry entry = new Entry();
\r
// Offset of the line currently being processed; starts at the beginning of the file.
90 long fileLocation = 0;
\r
// NOTE(review): RandomAccessFile.readLine turns each byte into one char and
// does not decode multi-byte encodings -- confirm that parseFromLine copes
// with non-ASCII dictionary text.
91 while ((line = dictionaryFile.readLine()) != null) {
\r
// The offset is narrowed to int below, so it must fit in an int.
92 assert ((int) fileLocation) == fileLocation;
\r
// Empty lines, lines starting with "#" and lines that fail to parse take this branch.
95 if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {
\r
98 final String text = entry.getIndexableText(Entry.LANG1);
\r
99 final String[] tokens = WHITESPACE.split(text);
\r
// Keyed by the raw token, so a token repeated within one entry is recorded
// once; LinkedHashMap keeps first-seen order.
100 final Map<String,String> tokenToNormalizedMap = new LinkedHashMap<String,String>();
\r
101 for (String token : tokens) {
\r
// Tokens of length 0 or 1, and tokens not starting with a letter, take this branch.
102 if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {
\r
105 tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token));
\r
107 for (final Map.Entry<String, String> tokenToNormalized : tokenToNormalizedMap.entrySet()) {
\r
108 final String normalizedToken = tokenToNormalized.getValue();
\r
// The third argument is true here and false in the asserts below;
// presumably a create-if-missing flag -- confirm against Node.getNode.
109 final Node node = root.getNode(normalizedToken, 0, true);
\r
// tokens.length counts every whitespace-separated token of the text,
// including the ones rejected by the filter above.
110 node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length));
\r
// Sanity checks: a second lookup returns the same node, and that node
// carries the normalized token it was looked up by.
111 assert node == root.getNode(normalizedToken, 0, false);
\r
112 assert normalizedToken
\r
113 .equals(root.getNode(normalizedToken, 0, false).normalizedToken);
\r
// Progress report every 10000 lines.
116 if (lineCount % 10000 == 0) {
\r
117 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
// The file pointer now sits at the start of the next line.
// NOTE(review): this is the only place fileLocation is refreshed; confirm
// that the skip branches above do not bypass it.
121 fileLocation = dictionaryFile.getFilePointer();
\r
123 dictionaryFile.close();
\r
125 root.recursiveSetDescendantCounts();
\r