1 package com.hughes.android.dictionary;
\r
3 import java.io.IOException;
\r
4 import java.io.RandomAccessFile;
\r
5 import java.io.Serializable;
\r
6 import java.util.ArrayList;
\r
7 import java.util.Collections;
\r
8 import java.util.LinkedHashSet;
\r
9 import java.util.List;
\r
10 import java.util.Map;
\r
11 import java.util.Set;
\r
12 import java.util.TreeMap;
\r
13 import java.util.regex.Pattern;
\r
15 import com.hughes.util.FileUtil;
\r
17 public class IndexBuilder {
\r
19 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
20 static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");
\r
22 public static void main(String[] args) throws IOException,
\r
23 ClassNotFoundException {
\r
24 if (args.length != 1) {
\r
25 System.err.println("No input file.");
\r
29 final String file = args[0];
\r
30 final byte lang = Entry.LANG1;
\r
32 rootBuilder = createIndex(file, lang);
\r
33 FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));
\r
34 rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));
\r
36 // final AtomicInteger c = new AtomicInteger();
\r
37 rootBuilder.forEachNode(new Function<Node>() {
\r
39 public void invoke(Node t) {
\r
40 Collections.sort(t.offsets);
\r
41 // if (t.offsets.size() > 128) {
\r
42 // System.out.println(t);
\r
43 // c.incrementAndGet();
\r
46 // System.out.println(c);
\r
48 // rootBuilder.recursiveSetDescendantOffsetCount();
\r
49 // rootBuilder.packDescendants(128);
\r
51 // Dump twice to get accurate file locations.
\r
52 for (int i = 0; i < 2; ++i) {
\r
53 final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw");
\r
54 rootBuilder.dump(raf);
\r
60 // ----------------------------------------------------------------
\r
62 static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {
\r
64 final int numTokens;
\r
65 public EntryDescriptor(int offset, int numTokens) {
\r
66 this.offset = offset;
\r
67 this.numTokens = numTokens;
\r
70 public boolean equals(Object obj) {
\r
71 final EntryDescriptor that = (EntryDescriptor) obj;
\r
72 return this.offset == that.offset;
\r
75 public int hashCode() {
\r
79 public int compareTo(EntryDescriptor o) {
\r
80 return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;
\r
84 static final class Node implements Serializable {
\r
85 private static final long serialVersionUID = -5423134653901704956L;
\r
87 final TreeMap<String, Node> children = new TreeMap<String, Node>();
\r
88 final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();
\r
89 final String sequence;
\r
91 int descendantOffsetCount = 0;
\r
93 int indexFileLocation = -1;
\r
95 public Node(String sequence) {
\r
96 if (sequence.length() == 0) {
\r
97 System.out.println("Created root.");
\r
99 this.sequence = sequence.intern();
\r
102 public Node getIndexNode(final String word, final int pos,
\r
103 final boolean create) {
\r
104 assert this.sequence.equals(word.substring(0, pos));
\r
106 if (pos == word.length()) {
\r
107 assert sequence.equals(word);
\r
111 final String rest = word.substring(pos);
\r
112 assert rest.length() > 0;
\r
114 final Map.Entry<String, Node> lcsEntry;
\r
117 final Map.Entry<String, Node> floorEntry = children.floorEntry(rest);
\r
118 final Map.Entry<String, Node> ceilingEntry = children
\r
119 .ceilingEntry(rest);
\r
120 final String floorLcs = floorEntry == null ? "" : StringUtil
\r
121 .longestCommonSubstring(rest, floorEntry.getKey());
\r
122 final String ceilingLcs = ceilingEntry == null ? "" : StringUtil
\r
123 .longestCommonSubstring(rest, ceilingEntry.getKey());
\r
124 if (floorLcs.length() > ceilingLcs.length()) {
\r
125 lcsEntry = floorEntry;
\r
128 lcsEntry = ceilingEntry;
\r
133 // No LCS, have to add everything.
\r
134 if (lcs.length() == 0) {
\r
138 final Node result = new Node(word);
\r
139 final Object old = children.put(rest.intern(), result);
\r
140 assert old == null;
\r
141 // System.out.println(" Adding final chunk: " + rest);
\r
145 assert lcsEntry != null;
\r
147 // The map already contained the LCS.
\r
148 if (lcs.length() == lcsEntry.getKey().length()) {
\r
149 assert lcs.equals(lcsEntry.getKey());
\r
150 final Node result = lcsEntry.getValue().getIndexNode(word,
\r
151 pos + lcs.length(), create);
\r
152 assert result.sequence.equals(word);
\r
160 // Have to split, inserting the LCS.
\r
161 // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +
\r
163 final Node newChild = new Node(word.substring(0, pos + lcs.length()));
\r
164 final Object old = children.put(lcs.intern(), newChild);
\r
165 assert old == null;
\r
166 children.remove(lcsEntry.getKey());
\r
167 newChild.children.put(lcsEntry.getKey().substring(lcs.length())
\r
168 .intern(), lcsEntry.getValue());
\r
170 if (lcs.equals(rest)) {
\r
173 final Node result = new Node(word);
\r
174 final Object old2 = newChild.children.put(rest.substring(lcs.length())
\r
175 .intern(), result);
\r
176 assert old2 == null;
\r
177 // System.out.println(" newchildren=" + newChild.children);
\r
182 MemoryIndex.Node toIndexNode() {
\r
183 final MemoryIndex.Node result = new MemoryIndex.Node(children.size(), offsets
\r
186 for (final Map.Entry<String, Node> entry : children.entrySet()) {
\r
187 result.chars[i] = entry.getKey();
\r
188 result.children[i] = entry.getValue().toIndexNode();
\r
194 void forEachNode(final Function<Node> f) {
\r
196 for (final Node child : children.values()) {
\r
197 child.forEachNode(f);
\r
201 int descendantCount() {
\r
203 for (final Node child : children.values()) {
\r
204 count += child.descendantCount();
\r
209 void recursiveSetDescendantOffsetCount() {
\r
210 descendantOffsetCount = offsets.size();
\r
211 for (final Node child : children.values()) {
\r
212 child.recursiveSetDescendantOffsetCount();
\r
213 descendantOffsetCount += child.descendantOffsetCount;
\r
217 public void packDescendants(final int maxDescendants) {
\r
218 if (descendantOffsetCount <= maxDescendants) {
\r
219 final Set<EntryDescriptor> descendantOffsets = new LinkedHashSet<EntryDescriptor>();
\r
220 recursiveAddDescendants(descendantOffsets);
\r
221 assert descendantOffsets.size() <= maxDescendants;
\r
223 offsets.addAll(descendantOffsets);
\r
226 for (final Node child : children.values()) {
\r
227 child.packDescendants(maxDescendants);
\r
232 private void recursiveAddDescendants(final Set<EntryDescriptor> descendantOffsets) {
\r
233 descendantOffsets.addAll(this.offsets);
\r
234 for (final Node child : children.values()) {
\r
235 child.recursiveAddDescendants(descendantOffsets);
\r
240 public String toString() {
\r
241 return sequence + ":" + offsets.size();
\r
244 void dump(final RandomAccessFile file) throws IOException {
\r
245 if (indexFileLocation == -1) {
\r
246 indexFileLocation = (int) file.getFilePointer();
\r
248 assert indexFileLocation == file.getFilePointer();
\r
252 file.writeInt(children.size());
\r
253 for (final Map.Entry<String, Node> child : children.entrySet()) {
\r
254 file.writeUTF(child.getKey());
\r
255 file.writeInt(child.getValue().indexFileLocation);
\r
259 file.writeInt(offsets.size());
\r
260 for (int i = 0; i < offsets.size(); i++) {
\r
261 file.writeInt(offsets.get(i).offset);
\r
265 for (final Map.Entry<String, Node> child : children.entrySet()) {
\r
266 child.getValue().dump(file);
\r
271 // ----------------------------------------------------------------
\r
273 static Node createIndex(final String file, final byte lang) throws IOException {
\r
274 final Node root = new Node("");
\r
275 final RandomAccessFile raf = new RandomAccessFile(file, "r");
\r
277 final Entry entry = new Entry();
\r
279 long fileLocation = 0;
\r
280 while ((line = raf.readLine()) != null) {
\r
281 assert ((int) fileLocation) == fileLocation;
\r
283 line = line.trim();
\r
284 if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {
\r
287 final String text = entry.getIndexableText(Entry.LANG1);
\r
288 final String[] tokens = WHITESPACE.split(text);
\r
289 final Set<String> tokenSet = new LinkedHashSet<String>();
\r
290 for (String token : tokens) {
\r
291 if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {
\r
294 tokenSet.add(entry.normalizeToken(token, lang));
\r
296 for (final String normalized : tokenSet) {
\r
297 // System.out.println("Inserting: " + normalized);
\r
298 if ("die".equals(normalized) || "eine".equals(normalized)) {
\r
299 // System.out.println("hello");
\r
301 final Node node = root.getIndexNode(normalized, 0, true);
\r
302 node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length));
\r
303 assert node == root.getIndexNode(normalized, 0, false);
\r
305 .equals(root.getIndexNode(normalized, 0, false).sequence);
\r
308 if (lineCount % 10000 == 0) {
\r
309 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
313 fileLocation = raf.getFilePointer();
\r