package com.hughes.android.dictionary;\r
\r
-import java.io.DataOutputStream;\r
-import java.io.File;\r
-import java.io.FileOutputStream;\r
import java.io.IOException;\r
import java.io.RandomAccessFile;\r
import java.io.Serializable;\r
import java.util.ArrayList;\r
+import java.util.Collections;\r
import java.util.LinkedHashSet;\r
import java.util.List;\r
import java.util.Map;\r
import java.util.Set;\r
import java.util.TreeMap;\r
-import java.util.concurrent.atomic.AtomicInteger;\r
import java.util.regex.Pattern;\r
\r
-import com.hughes.android.dictionary.Index.Node;\r
import com.hughes.util.FileUtil;\r
\r
public class IndexBuilder {\r
final String file = args[0];\r
final byte lang = Entry.LANG1;\r
Node rootBuilder;\r
-// rootBuilder = createIndex(file, lang);\r
-// FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));\r
+ rootBuilder = createIndex(file, lang);\r
+ FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));\r
rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));\r
- \r
- final AtomicInteger c = new AtomicInteger();\r
+\r
+// final AtomicInteger c = new AtomicInteger();\r
rootBuilder.forEachNode(new Function<Node>() {\r
@Override\r
public void invoke(Node t) {\r
- if (t.offsetsList.size() > 200) {\r
- System.out.println(t);\r
- c.incrementAndGet();\r
- }\r
+ Collections.sort(t.offsets);\r
+// if (t.offsets.size() > 128) {\r
+// System.out.println(t);\r
+// c.incrementAndGet();\r
+// }\r
}});\r
- System.out.println(c);\r
- \r
- rootBuilder.recursiveSetDescendantOffsetCount();\r
- rootBuilder.packDescendants(128);\r
-\r
- final DataOutputStream os = new DataOutputStream(new FileOutputStream(\r
- String.format("%s_index_%d", file, lang)));\r
- final Index.Node root = rootBuilder.toIndexNode();\r
- root.write(os);\r
- os.close();\r
+// System.out.println(c);\r
\r
- FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang));\r
+// rootBuilder.recursiveSetDescendantOffsetCount();\r
+// rootBuilder.packDescendants(128);\r
+\r
+ // Dump twice to get accurate file locations.\r
+ for (int i = 0; i < 2; ++i) {\r
+ final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw"); \r
+ rootBuilder.dump(raf);\r
+ raf.close();\r
+ }\r
\r
- Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang));\r
-\r
-\r
}\r
\r
// ----------------------------------------------------------------\r
+ \r
+ static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {\r
+ final int offset;\r
+ final int numTokens;\r
+ public EntryDescriptor(int offset, int numTokens) {\r
+ this.offset = offset;\r
+ this.numTokens = numTokens;\r
+ }\r
+ @Override\r
+ public boolean equals(Object obj) {\r
+ final EntryDescriptor that = (EntryDescriptor) obj;\r
+ return this.offset == that.offset;\r
+ }\r
+ @Override\r
+ public int hashCode() {\r
+ return offset;\r
+ }\r
+ @Override\r
+ public int compareTo(EntryDescriptor o) {\r
+ return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;\r
+ }\r
+ }\r
\r
static final class Node implements Serializable {\r
private static final long serialVersionUID = -5423134653901704956L;\r
\r
- final TreeMap<String, Node> childrenMap = new TreeMap<String, Node>();\r
- final List<Integer> offsetsList = new ArrayList<Integer>();\r
+ final TreeMap<String, Node> children = new TreeMap<String, Node>();\r
+ final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();\r
final String sequence;\r
+ \r
int descendantOffsetCount = 0;\r
+ \r
+ int indexFileLocation = -1;\r
\r
public Node(String sequence) {\r
if (sequence.length() == 0) {\r
final Map.Entry<String, Node> lcsEntry;\r
final String lcs;\r
{\r
- final Map.Entry<String, Node> floorEntry = childrenMap.floorEntry(rest);\r
- final Map.Entry<String, Node> ceilingEntry = childrenMap\r
+ final Map.Entry<String, Node> floorEntry = children.floorEntry(rest);\r
+ final Map.Entry<String, Node> ceilingEntry = children\r
.ceilingEntry(rest);\r
final String floorLcs = floorEntry == null ? "" : StringUtil\r
.longestCommonSubstring(rest, floorEntry.getKey());\r
return null;\r
}\r
final Node result = new Node(word);\r
- final Object old = childrenMap.put(rest.intern(), result);\r
+ final Object old = children.put(rest.intern(), result);\r
assert old == null;\r
// System.out.println(" Adding final chunk: " + rest);\r
return result;\r
// System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +\r
// lcs);\r
final Node newChild = new Node(word.substring(0, pos + lcs.length()));\r
- final Object old = childrenMap.put(lcs.intern(), newChild);\r
+ final Object old = children.put(lcs.intern(), newChild);\r
assert old == null;\r
- childrenMap.remove(lcsEntry.getKey());\r
- newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length())\r
+ children.remove(lcsEntry.getKey());\r
+ newChild.children.put(lcsEntry.getKey().substring(lcs.length())\r
.intern(), lcsEntry.getValue());\r
\r
if (lcs.equals(rest)) {\r
return newChild;\r
}\r
final Node result = new Node(word);\r
- final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length())\r
+ final Object old2 = newChild.children.put(rest.substring(lcs.length())\r
.intern(), result);\r
assert old2 == null;\r
- // System.out.println(" newChildrenMap=" + newChild.childrenMap);\r
+ // System.out.println(" newchildren=" + newChild.children);\r
\r
return result;\r
}\r
\r
- Index.Node toIndexNode() {\r
- final Index.Node result = new Index.Node(childrenMap.size(), offsetsList\r
+ MemoryIndex.Node toIndexNode() {\r
+ final MemoryIndex.Node result = new MemoryIndex.Node(children.size(), offsets\r
.size());\r
int i = 0;\r
- for (final Map.Entry<String, Node> entry : childrenMap.entrySet()) {\r
+ for (final Map.Entry<String, Node> entry : children.entrySet()) {\r
result.chars[i] = entry.getKey();\r
result.children[i] = entry.getValue().toIndexNode();\r
i++;\r
\r
void forEachNode(final Function<Node> f) {\r
f.invoke(this);\r
- for (final Node child : childrenMap.values()) {\r
+ for (final Node child : children.values()) {\r
child.forEachNode(f);\r
}\r
}\r
\r
int descendantCount() {\r
int count = 1;\r
- for (final Node child : childrenMap.values()) {\r
+ for (final Node child : children.values()) {\r
count += child.descendantCount();\r
}\r
return count;\r
}\r
\r
void recursiveSetDescendantOffsetCount() {\r
- descendantOffsetCount = offsetsList.size();\r
- for (final Node child : childrenMap.values()) {\r
+ descendantOffsetCount = offsets.size();\r
+ for (final Node child : children.values()) {\r
child.recursiveSetDescendantOffsetCount();\r
descendantOffsetCount += child.descendantOffsetCount;\r
}\r
\r
public void packDescendants(final int maxDescendants) {\r
if (descendantOffsetCount <= maxDescendants) {\r
- final Set<Integer> descendantOffsets = new LinkedHashSet<Integer>();\r
+ final Set<EntryDescriptor> descendantOffsets = new LinkedHashSet<EntryDescriptor>();\r
recursiveAddDescendants(descendantOffsets);\r
assert descendantOffsets.size() <= maxDescendants;\r
- offsetsList.clear();\r
- offsetsList.addAll(descendantOffsets);\r
- childrenMap.clear();\r
+ offsets.clear();\r
+ offsets.addAll(descendantOffsets);\r
+ children.clear();\r
} else {\r
- for (final Node child : childrenMap.values()) {\r
+ for (final Node child : children.values()) {\r
child.packDescendants(maxDescendants);\r
}\r
}\r
}\r
\r
- private void recursiveAddDescendants(final Set<Integer> descendantOffsets) {\r
- descendantOffsets.addAll(this.offsetsList);\r
- for (final Node child : childrenMap.values()) {\r
+ private void recursiveAddDescendants(final Set<EntryDescriptor> descendantOffsets) {\r
+ descendantOffsets.addAll(this.offsets);\r
+ for (final Node child : children.values()) {\r
child.recursiveAddDescendants(descendantOffsets);\r
}\r
}\r
\r
-\r
@Override\r
public String toString() {\r
- return sequence + ":" + offsetsList.size();\r
+ return sequence + ":" + offsets.size();\r
+ }\r
+ \r
+ void dump(final RandomAccessFile file) throws IOException {\r
+ if (indexFileLocation == -1) {\r
+ indexFileLocation = (int) file.getFilePointer();\r
+ } else {\r
+ assert indexFileLocation == file.getFilePointer();\r
+ }\r
+ \r
+ // Children.\r
+ file.writeInt(children.size());\r
+ for (final Map.Entry<String, Node> child : children.entrySet()) {\r
+ file.writeUTF(child.getKey());\r
+ file.writeInt(child.getValue().indexFileLocation);\r
+ }\r
+ \r
+ // Offsets.\r
+ file.writeInt(offsets.size());\r
+ for (int i = 0; i < offsets.size(); i++) {\r
+ file.writeInt(offsets.get(i).offset);\r
+ }\r
+ \r
+ // Dump children.\r
+ for (final Map.Entry<String, Node> child : children.entrySet()) {\r
+ child.getValue().dump(file);\r
+ }\r
}\r
-\r
}\r
\r
// ----------------------------------------------------------------\r
String line;\r
final Entry entry = new Entry();\r
int lineCount = 0;\r
+ long fileLocation = 0;\r
while ((line = raf.readLine()) != null) {\r
- final long fileLocation = raf.getFilePointer();\r
assert ((int) fileLocation) == fileLocation;\r
\r
line = line.trim();\r
// System.out.println("hello");\r
}\r
final Node node = root.getIndexNode(normalized, 0, true);\r
- node.offsetsList.add((int) fileLocation);\r
+ node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length));\r
assert node == root.getIndexNode(normalized, 0, false);\r
assert normalized\r
.equals(root.getIndexNode(normalized, 0, false).sequence);\r
if (lineCount % 10000 == 0) {\r
System.out.println("IndexBuilder: " + "lineCount=" + lineCount);\r
}\r
+ \r
lineCount++;\r
+ fileLocation = raf.getFilePointer();\r
}\r
raf.close();\r
return root;\r