1 package com.hughes.android.dictionary;
\r
3 import java.io.DataOutputStream;
\r
5 import java.io.FileOutputStream;
\r
6 import java.io.IOException;
\r
7 import java.io.RandomAccessFile;
\r
8 import java.io.Serializable;
\r
9 import java.util.ArrayList;
\r
10 import java.util.LinkedHashSet;
\r
11 import java.util.List;
\r
12 import java.util.Map;
\r
13 import java.util.Set;
\r
14 import java.util.TreeMap;
\r
15 import java.util.concurrent.atomic.AtomicInteger;
\r
16 import java.util.regex.Pattern;
\r
18 import com.hughes.android.dictionary.Index.Node;
\r
19 import com.hughes.util.FileUtil;
\r
21 public class IndexBuilder {
\r
23 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
24 static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");
\r
26 public static void main(String[] args) throws IOException,
\r
27 ClassNotFoundException {
\r
28 if (args.length != 1) {
\r
29 System.err.println("No input file.");
\r
33 final String file = args[0];
\r
34 final byte lang = Entry.LANG1;
\r
36 // rootBuilder = createIndex(file, lang);
\r
37 // FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));
\r
38 rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));
\r
40 final AtomicInteger c = new AtomicInteger();
\r
41 rootBuilder.forEachNode(new Function<Node>() {
\r
43 public void invoke(Node t) {
\r
44 if (t.offsetsList.size() > 200) {
\r
45 System.out.println(t);
\r
46 c.incrementAndGet();
\r
49 System.out.println(c);
\r
51 rootBuilder.recursiveSetDescendantOffsetCount();
\r
52 rootBuilder.packDescendants(128);
\r
54 final DataOutputStream os = new DataOutputStream(new FileOutputStream(
\r
55 String.format("%s_index_%d", file, lang)));
\r
56 final Index.Node root = rootBuilder.toIndexNode();
\r
60 FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang));
\r
62 Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang));
\r
67 // ----------------------------------------------------------------
\r
69 static final class Node implements Serializable {
\r
70 private static final long serialVersionUID = -5423134653901704956L;
\r
72 final TreeMap<String, Node> childrenMap = new TreeMap<String, Node>();
\r
73 final List<Integer> offsetsList = new ArrayList<Integer>();
\r
74 final String sequence;
\r
75 int descendantOffsetCount = 0;
\r
77 public Node(String sequence) {
\r
78 if (sequence.length() == 0) {
\r
79 System.out.println("Created root.");
\r
81 this.sequence = sequence.intern();
\r
84 public Node getIndexNode(final String word, final int pos,
\r
85 final boolean create) {
\r
86 assert this.sequence.equals(word.substring(0, pos));
\r
88 if (pos == word.length()) {
\r
89 assert sequence.equals(word);
\r
93 final String rest = word.substring(pos);
\r
94 assert rest.length() > 0;
\r
96 final Map.Entry<String, Node> lcsEntry;
\r
99 final Map.Entry<String, Node> floorEntry = childrenMap.floorEntry(rest);
\r
100 final Map.Entry<String, Node> ceilingEntry = childrenMap
\r
101 .ceilingEntry(rest);
\r
102 final String floorLcs = floorEntry == null ? "" : StringUtil
\r
103 .longestCommonSubstring(rest, floorEntry.getKey());
\r
104 final String ceilingLcs = ceilingEntry == null ? "" : StringUtil
\r
105 .longestCommonSubstring(rest, ceilingEntry.getKey());
\r
106 if (floorLcs.length() > ceilingLcs.length()) {
\r
107 lcsEntry = floorEntry;
\r
110 lcsEntry = ceilingEntry;
\r
115 // No LCS, have to add everything.
\r
116 if (lcs.length() == 0) {
\r
120 final Node result = new Node(word);
\r
121 final Object old = childrenMap.put(rest.intern(), result);
\r
122 assert old == null;
\r
123 // System.out.println(" Adding final chunk: " + rest);
\r
127 assert lcsEntry != null;
\r
129 // The map already contained the LCS.
\r
130 if (lcs.length() == lcsEntry.getKey().length()) {
\r
131 assert lcs.equals(lcsEntry.getKey());
\r
132 final Node result = lcsEntry.getValue().getIndexNode(word,
\r
133 pos + lcs.length(), create);
\r
134 assert result.sequence.equals(word);
\r
142 // Have to split, inserting the LCS.
\r
143 // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +
\r
145 final Node newChild = new Node(word.substring(0, pos + lcs.length()));
\r
146 final Object old = childrenMap.put(lcs.intern(), newChild);
\r
147 assert old == null;
\r
148 childrenMap.remove(lcsEntry.getKey());
\r
149 newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length())
\r
150 .intern(), lcsEntry.getValue());
\r
152 if (lcs.equals(rest)) {
\r
155 final Node result = new Node(word);
\r
156 final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length())
\r
157 .intern(), result);
\r
158 assert old2 == null;
\r
159 // System.out.println(" newChildrenMap=" + newChild.childrenMap);
\r
164 Index.Node toIndexNode() {
\r
165 final Index.Node result = new Index.Node(childrenMap.size(), offsetsList
\r
168 for (final Map.Entry<String, Node> entry : childrenMap.entrySet()) {
\r
169 result.chars[i] = entry.getKey();
\r
170 result.children[i] = entry.getValue().toIndexNode();
\r
176 void forEachNode(final Function<Node> f) {
\r
178 for (final Node child : childrenMap.values()) {
\r
179 child.forEachNode(f);
\r
183 int descendantCount() {
\r
185 for (final Node child : childrenMap.values()) {
\r
186 count += child.descendantCount();
\r
191 void recursiveSetDescendantOffsetCount() {
\r
192 descendantOffsetCount = offsetsList.size();
\r
193 for (final Node child : childrenMap.values()) {
\r
194 child.recursiveSetDescendantOffsetCount();
\r
195 descendantOffsetCount += child.descendantOffsetCount;
\r
199 public void packDescendants(final int maxDescendants) {
\r
200 if (descendantOffsetCount <= maxDescendants) {
\r
201 final Set<Integer> descendantOffsets = new LinkedHashSet<Integer>();
\r
202 recursiveAddDescendants(descendantOffsets);
\r
203 assert descendantOffsets.size() <= maxDescendants;
\r
204 offsetsList.clear();
\r
205 offsetsList.addAll(descendantOffsets);
\r
206 childrenMap.clear();
\r
208 for (final Node child : childrenMap.values()) {
\r
209 child.packDescendants(maxDescendants);
\r
214 private void recursiveAddDescendants(final Set<Integer> descendantOffsets) {
\r
215 descendantOffsets.addAll(this.offsetsList);
\r
216 for (final Node child : childrenMap.values()) {
\r
217 child.recursiveAddDescendants(descendantOffsets);
\r
223 public String toString() {
\r
224 return sequence + ":" + offsetsList.size();
\r
229 // ----------------------------------------------------------------
\r
231 static Node createIndex(final String file, final byte lang) throws IOException {
\r
232 final Node root = new Node("");
\r
233 final RandomAccessFile raf = new RandomAccessFile(file, "r");
\r
235 final Entry entry = new Entry();
\r
237 while ((line = raf.readLine()) != null) {
\r
238 final long fileLocation = raf.getFilePointer();
\r
239 assert ((int) fileLocation) == fileLocation;
\r
241 line = line.trim();
\r
242 if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {
\r
245 final String text = entry.getIndexableText(Entry.LANG1);
\r
246 final String[] tokens = WHITESPACE.split(text);
\r
247 final Set<String> tokenSet = new LinkedHashSet<String>();
\r
248 for (String token : tokens) {
\r
249 if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {
\r
252 tokenSet.add(entry.normalizeToken(token, lang));
\r
254 for (final String normalized : tokenSet) {
\r
255 // System.out.println("Inserting: " + normalized);
\r
256 if ("die".equals(normalized) || "eine".equals(normalized)) {
\r
257 // System.out.println("hello");
\r
259 final Node node = root.getIndexNode(normalized, 0, true);
\r
260 node.offsetsList.add((int) fileLocation);
\r
261 assert node == root.getIndexNode(normalized, 0, false);
\r
263 .equals(root.getIndexNode(normalized, 0, false).sequence);
\r
266 if (lineCount % 10000 == 0) {
\r
267 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r