package com.hughes.android.dictionary;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import com.hughes.util.FileUtil;
// NOTE(review): StringUtil.longestCommonSubstring is called in Node.getNode;
// its import was lost in the corrupted region of this file.
import com.hughes.util.StringUtil;
\r
17 public class IndexBuilder {
\r
19 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
20 static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");
\r
22 public static void main(String[] args) throws IOException,
\r
23 ClassNotFoundException {
\r
24 if (args.length != 1) {
\r
25 System.err.println("No input file.");
\r
29 final String file = args[0];
\r
30 final byte lang = Entry.LANG1;
\r
32 rootBuilder = createIndex(file, lang);
\r
33 FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));
\r
34 rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));
\r
36 rootBuilder.forEachNode(new Function<Node>() {
\r
38 public void invoke(final Node node) {
\r
39 for (final List<EntryDescriptor> entryDescriptors : node.entryDescriptorsMap.values()) {
\r
40 Collections.sort(entryDescriptors);
\r
44 // Dump twice to get accurate file locations.
\r
45 for (int i = 0; i < 2; ++i) {
\r
46 final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw");
\r
47 rootBuilder.dump(raf);
\r
    // ----------------------------------------------------------------
\r
55 static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {
\r
57 final int numTokens;
\r
58 public EntryDescriptor(int offset, int numTokens) {
\r
59 this.offset = offset;
\r
60 this.numTokens = numTokens;
\r
63 public boolean equals(Object obj) {
\r
64 final EntryDescriptor that = (EntryDescriptor) obj;
\r
65 return this.offset == that.offset;
\r
68 public int hashCode() {
\r
72 public int compareTo(EntryDescriptor o) {
\r
73 return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;
\r
77 static final class Node implements Serializable {
\r
78 final String normalizedWord;
\r
80 final TreeMap<String, Node> children = new TreeMap<String, Node>();
\r
81 final TreeMap<String,List<EntryDescriptor>> entryDescriptorsMap = new TreeMap<String, List<EntryDescriptor>>();
\r
83 // final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();
\r
85 int descendantOffsetCount = 0;
\r
87 int indexFileLocation = -1;
\r
89 public Node(final String normalizedWord) {
\r
90 if (normalizedWord.length() == 0) {
\r
91 System.out.println("Created root.");
\r
93 this.normalizedWord = normalizedWord.intern();
\r
96 public Node getNode(final String nWord, final int pos,
\r
97 final boolean create) {
\r
98 assert this.normalizedWord.equals(nWord.substring(0, pos));
\r
100 if (pos == nWord.length()) {
\r
101 assert normalizedWord.equals(nWord);
\r
105 final String rest = nWord.substring(pos);
\r
106 assert rest.length() > 0;
\r
108 final Map.Entry<String, Node> lcsEntry;
\r
111 final Map.Entry<String, Node> floorEntry = children.floorEntry(rest);
\r
112 final Map.Entry<String, Node> ceilingEntry = children
\r
113 .ceilingEntry(rest);
\r
114 final String floorLcs = floorEntry == null ? "" : StringUtil
\r
115 .longestCommonSubstring(rest, floorEntry.getKey());
\r
116 final String ceilingLcs = ceilingEntry == null ? "" : StringUtil
\r
117 .longestCommonSubstring(rest, ceilingEntry.getKey());
\r
118 if (floorLcs.length() > ceilingLcs.length()) {
\r
119 lcsEntry = floorEntry;
\r
122 lcsEntry = ceilingEntry;
\r
127 // No LCS, have to add everything.
\r
128 if (lcs.length() == 0) {
\r
132 final Node result = new Node(nWord);
\r
133 final Object old = children.put(rest.intern(), result);
\r
134 assert old == null;
\r
135 // System.out.println(" Adding final chunk: " + rest);
\r
139 assert lcsEntry != null;
\r
141 // The map already contained the LCS.
\r
142 if (lcs.length() == lcsEntry.getKey().length()) {
\r
143 assert lcs.equals(lcsEntry.getKey());
\r
144 final Node result = lcsEntry.getValue().getNode(nWord,
\r
145 pos + lcs.length(), create);
\r
146 assert result.normalizedWord.equals(nWord);
\r
154 // Have to split, inserting the LCS.
\r
155 // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +
\r
157 final Node newChild = new Node(nWord.substring(0, pos + lcs.length()));
\r
158 final Object old = children.put(lcs.intern(), newChild);
\r
159 assert old == null;
\r
160 children.remove(lcsEntry.getKey());
\r
161 newChild.children.put(lcsEntry.getKey().substring(lcs.length())
\r
162 .intern(), lcsEntry.getValue());
\r
164 if (lcs.equals(rest)) {
\r
167 final Node result = new Node(nWord);
\r
168 final Object old2 = newChild.children.put(rest.substring(lcs.length())
\r
169 .intern(), result);
\r
170 assert old2 == null;
\r
171 // System.out.println(" newchildren=" + newChild.children);
\r
176 void forEachNode(final Function<Node> f) {
\r
178 for (final Node child : children.values()) {
\r
179 child.forEachNode(f);
\r
183 int descendantCount() {
\r
185 for (final Node child : children.values()) {
\r
186 count += child.descendantCount();
\r
191 void recursiveSetDescendantOffsetCount() {
\r
192 descendantOffsetCount = offsets.size();
\r
193 for (final Node child : children.values()) {
\r
194 child.recursiveSetDescendantOffsetCount();
\r
195 descendantOffsetCount += child.descendantOffsetCount;
\r
200 public String toString() {
\r
201 return normalizedWord;
\r
204 void dump(final RandomAccessFile file) throws IOException {
\r
205 if (indexFileLocation == -1) {
\r
206 indexFileLocation = (int) file.getFilePointer();
\r
208 assert indexFileLocation == file.getFilePointer();
\r
212 file.writeInt(children.size());
\r
213 for (final Map.Entry<String, Node> child : children.entrySet()) {
\r
214 file.writeUTF(child.getKey());
\r
215 file.writeInt(child.getValue().indexFileLocation);
\r
219 file.writeInt(entryDescriptorsMap.size());
\r
220 for (final Map.Entry<String, List<EntryDescriptor>> entry : entryDescriptorsMap.entrySet()) {
\r
221 file.writeUTF(entry.getKey());
\r
222 file.writeInt(entry.getValue().size());
\r
224 file.writeInt(offsets.get(i).offset);
\r
228 for (final Map.Entry<String, Node> child : children.entrySet()) {
\r
229 child.getValue().dump(file);
\r
    // ----------------------------------------------------------------
\r
236 static Node createIndex(final String file, final byte lang) throws IOException {
\r
237 final Node root = new Node("");
\r
238 final RandomAccessFile raf = new RandomAccessFile(file, "r");
\r
240 final Entry entry = new Entry();
\r
242 long fileLocation = 0;
\r
243 while ((line = raf.readLine()) != null) {
\r
244 assert ((int) fileLocation) == fileLocation;
\r
246 line = line.trim();
\r
247 if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {
\r
250 final String text = entry.getIndexableText(Entry.LANG1);
\r
251 final String[] tokens = WHITESPACE.split(text);
\r
252 final Set<String> tokenSet = new LinkedHashSet<String>();
\r
253 for (String token : tokens) {
\r
254 if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {
\r
257 tokenSet.add(EntryFactory.entryFactory.normalizeToken(token, lang));
\r
259 for (final String normalized : tokenSet) {
\r
260 // System.out.println("Inserting: " + normalized);
\r
261 if ("die".equals(normalized) || "eine".equals(normalized)) {
\r
262 // System.out.println("hello");
\r
264 final Node node = root.getNode(normalized, 0, true);
\r
265 node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length));
\r
266 assert node == root.getNode(normalized, 0, false);
\r
268 .equals(root.getNode(normalized, 0, false).normalizedWord);
\r
271 if (lineCount % 10000 == 0) {
\r
272 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
276 fileLocation = raf.getFilePointer();
\r