--- /dev/null
+package com.hughes.android.dictionary;\r
+\r
+import java.io.DataOutputStream;\r
+import java.io.File;\r
+import java.io.FileOutputStream;\r
+import java.io.IOException;\r
+import java.io.RandomAccessFile;\r
+import java.io.Serializable;\r
+import java.util.ArrayList;\r
+import java.util.LinkedHashSet;\r
+import java.util.List;\r
+import java.util.Map;\r
+import java.util.Set;\r
+import java.util.TreeMap;\r
+import java.util.concurrent.atomic.AtomicInteger;\r
+import java.util.regex.Pattern;\r
+\r
+import com.hughes.android.dictionary.Index.Node;\r
+import com.hughes.util.FileUtil;\r
+\r
+public class IndexBuilder {\r
+\r
+ static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
+ static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");\r
+\r
+ public static void main(String[] args) throws IOException,\r
+ ClassNotFoundException {\r
+ if (args.length != 1) {\r
+ System.err.println("No input file.");\r
+ return;\r
+ }\r
+\r
+ final String file = args[0];\r
+ final byte lang = Entry.LANG1;\r
+ Node rootBuilder;\r
+// rootBuilder = createIndex(file, lang);\r
+// FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));\r
+ rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));\r
+ \r
+ final AtomicInteger c = new AtomicInteger();\r
+ rootBuilder.forEachNode(new Function<Node>() {\r
+ @Override\r
+ public void invoke(Node t) {\r
+ if (t.offsetsList.size() > 200) {\r
+ System.out.println(t);\r
+ c.incrementAndGet();\r
+ }\r
+ }});\r
+ System.out.println(c);\r
+ \r
+ rootBuilder.recursiveSetDescendantOffsetCount();\r
+ rootBuilder.packDescendants(128);\r
+\r
+ final DataOutputStream os = new DataOutputStream(new FileOutputStream(\r
+ String.format("%s_index_%d", file, lang)));\r
+ final Index.Node root = rootBuilder.toIndexNode();\r
+ root.write(os);\r
+ os.close();\r
+ \r
+ FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang));\r
+ \r
+ Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang));\r
+\r
+\r
+ }\r
+\r
+ // ----------------------------------------------------------------\r
+\r
+ static final class Node implements Serializable {\r
+ private static final long serialVersionUID = -5423134653901704956L;\r
+ \r
+ final TreeMap<String, Node> childrenMap = new TreeMap<String, Node>();\r
+ final List<Integer> offsetsList = new ArrayList<Integer>();\r
+ final String sequence;\r
+ int descendantOffsetCount = 0;\r
+\r
+ public Node(String sequence) {\r
+ if (sequence.length() == 0) {\r
+ System.out.println("Created root.");\r
+ }\r
+ this.sequence = sequence.intern();\r
+ }\r
+\r
+ public Node getIndexNode(final String word, final int pos,\r
+ final boolean create) {\r
+ assert this.sequence.equals(word.substring(0, pos));\r
+\r
+ if (pos == word.length()) {\r
+ assert sequence.equals(word);\r
+ return this;\r
+ }\r
+\r
+ final String rest = word.substring(pos);\r
+ assert rest.length() > 0;\r
+\r
+ final Map.Entry<String, Node> lcsEntry;\r
+ final String lcs;\r
+ {\r
+ final Map.Entry<String, Node> floorEntry = childrenMap.floorEntry(rest);\r
+ final Map.Entry<String, Node> ceilingEntry = childrenMap\r
+ .ceilingEntry(rest);\r
+ final String floorLcs = floorEntry == null ? "" : StringUtil\r
+ .longestCommonSubstring(rest, floorEntry.getKey());\r
+ final String ceilingLcs = ceilingEntry == null ? "" : StringUtil\r
+ .longestCommonSubstring(rest, ceilingEntry.getKey());\r
+ if (floorLcs.length() > ceilingLcs.length()) {\r
+ lcsEntry = floorEntry;\r
+ lcs = floorLcs;\r
+ } else {\r
+ lcsEntry = ceilingEntry;\r
+ lcs = ceilingLcs;\r
+ }\r
+ }\r
+\r
+ // No LCS, have to add everything.\r
+ if (lcs.length() == 0) {\r
+ if (!create) {\r
+ return null;\r
+ }\r
+ final Node result = new Node(word);\r
+ final Object old = childrenMap.put(rest.intern(), result);\r
+ assert old == null;\r
+ // System.out.println(" Adding final chunk: " + rest);\r
+ return result;\r
+ }\r
+\r
+ assert lcsEntry != null;\r
+\r
+ // The map already contained the LCS.\r
+ if (lcs.length() == lcsEntry.getKey().length()) {\r
+ assert lcs.equals(lcsEntry.getKey());\r
+ final Node result = lcsEntry.getValue().getIndexNode(word,\r
+ pos + lcs.length(), create);\r
+ assert result.sequence.equals(word);\r
+ return result;\r
+ }\r
+\r
+ if (!create) {\r
+ return null;\r
+ }\r
+\r
+ // Have to split, inserting the LCS.\r
+ // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +\r
+ // lcs);\r
+ final Node newChild = new Node(word.substring(0, pos + lcs.length()));\r
+ final Object old = childrenMap.put(lcs.intern(), newChild);\r
+ assert old == null;\r
+ childrenMap.remove(lcsEntry.getKey());\r
+ newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length())\r
+ .intern(), lcsEntry.getValue());\r
+\r
+ if (lcs.equals(rest)) {\r
+ return newChild;\r
+ }\r
+ final Node result = new Node(word);\r
+ final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length())\r
+ .intern(), result);\r
+ assert old2 == null;\r
+ // System.out.println(" newChildrenMap=" + newChild.childrenMap);\r
+\r
+ return result;\r
+ }\r
+\r
+ Index.Node toIndexNode() {\r
+ final Index.Node result = new Index.Node(childrenMap.size(), offsetsList\r
+ .size());\r
+ int i = 0;\r
+ for (final Map.Entry<String, Node> entry : childrenMap.entrySet()) {\r
+ result.chars[i] = entry.getKey();\r
+ result.children[i] = entry.getValue().toIndexNode();\r
+ i++;\r
+ }\r
+ return result;\r
+ }\r
+\r
+ void forEachNode(final Function<Node> f) {\r
+ f.invoke(this);\r
+ for (final Node child : childrenMap.values()) {\r
+ child.forEachNode(f);\r
+ }\r
+ }\r
+\r
+ int descendantCount() {\r
+ int count = 1;\r
+ for (final Node child : childrenMap.values()) {\r
+ count += child.descendantCount();\r
+ }\r
+ return count;\r
+ }\r
+\r
+ void recursiveSetDescendantOffsetCount() {\r
+ descendantOffsetCount = offsetsList.size();\r
+ for (final Node child : childrenMap.values()) {\r
+ child.recursiveSetDescendantOffsetCount();\r
+ descendantOffsetCount += child.descendantOffsetCount;\r
+ }\r
+ }\r
+\r
+ public void packDescendants(final int maxDescendants) {\r
+ if (descendantOffsetCount <= maxDescendants) {\r
+ final Set<Integer> descendantOffsets = new LinkedHashSet<Integer>();\r
+ recursiveAddDescendants(descendantOffsets);\r
+ assert descendantOffsets.size() <= maxDescendants;\r
+ offsetsList.clear();\r
+ offsetsList.addAll(descendantOffsets);\r
+ childrenMap.clear();\r
+ } else {\r
+ for (final Node child : childrenMap.values()) {\r
+ child.packDescendants(maxDescendants);\r
+ }\r
+ }\r
+ }\r
+\r
+ private void recursiveAddDescendants(final Set<Integer> descendantOffsets) {\r
+ descendantOffsets.addAll(this.offsetsList);\r
+ for (final Node child : childrenMap.values()) {\r
+ child.recursiveAddDescendants(descendantOffsets);\r
+ }\r
+ }\r
+\r
+\r
+ @Override\r
+ public String toString() {\r
+ return sequence + ":" + offsetsList.size();\r
+ }\r
+\r
+ }\r
+\r
+ // ----------------------------------------------------------------\r
+\r
+ static Node createIndex(final String file, final byte lang) throws IOException {\r
+ final Node root = new Node("");\r
+ final RandomAccessFile raf = new RandomAccessFile(file, "r");\r
+ String line;\r
+ final Entry entry = new Entry();\r
+ int lineCount = 0;\r
+ while ((line = raf.readLine()) != null) {\r
+ final long fileLocation = raf.getFilePointer();\r
+ assert ((int) fileLocation) == fileLocation;\r
+\r
+ line = line.trim();\r
+ if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {\r
+ continue;\r
+ }\r
+ final String text = entry.getIndexableText(Entry.LANG1);\r
+ final String[] tokens = WHITESPACE.split(text);\r
+ final Set<String> tokenSet = new LinkedHashSet<String>();\r
+ for (String token : tokens) {\r
+ if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {\r
+ continue;\r
+ }\r
+ tokenSet.add(entry.normalizeToken(token, lang));\r
+ }\r
+ for (final String normalized : tokenSet) {\r
+ // System.out.println("Inserting: " + normalized);\r
+ if ("die".equals(normalized) || "eine".equals(normalized)) {\r
+ // System.out.println("hello");\r
+ }\r
+ final Node node = root.getIndexNode(normalized, 0, true);\r
+ node.offsetsList.add((int) fileLocation);\r
+ assert node == root.getIndexNode(normalized, 0, false);\r
+ assert normalized\r
+ .equals(root.getIndexNode(normalized, 0, false).sequence);\r
+ }\r
+\r
+ if (lineCount % 10000 == 0) {\r
+ System.out.println("IndexBuilder: " + "lineCount=" + lineCount);\r
+ }\r
+ lineCount++;\r
+ }\r
+ raf.close();\r
+ return root;\r
+ }\r
+\r
+}\r