]> gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
go
authorthadh <thadh@THADH-MTV.corp.google.com>
Wed, 28 Jan 2009 19:34:20 +0000 (11:34 -0800)
committerthadh <thadh@THADH-MTV.corp.google.com>
Wed, 28 Jan 2009 19:34:20 +0000 (11:34 -0800)
src/com/hughes/android/dictionary/Function.java [new file with mode: 0755]
src/com/hughes/android/dictionary/IndexBuilder.java [new file with mode: 0755]
src/com/hughes/android/dictionary/StringUtil.java [new file with mode: 0755]
stoplist-de.txt [new file with mode: 0755]

diff --git a/src/com/hughes/android/dictionary/Function.java b/src/com/hughes/android/dictionary/Function.java
new file mode 100755 (executable)
index 0000000..72d9c02
--- /dev/null
@@ -0,0 +1,7 @@
+package com.hughes.android.dictionary;\r
+\r
+public interface Function<T> {\r
+  \r
+  void invoke(T t);\r
+\r
+}\r
diff --git a/src/com/hughes/android/dictionary/IndexBuilder.java b/src/com/hughes/android/dictionary/IndexBuilder.java
new file mode 100755 (executable)
index 0000000..9f055ca
--- /dev/null
@@ -0,0 +1,275 @@
+package com.hughes.android.dictionary;\r
+\r
+import java.io.DataOutputStream;\r
+import java.io.File;\r
+import java.io.FileOutputStream;\r
+import java.io.IOException;\r
+import java.io.RandomAccessFile;\r
+import java.io.Serializable;\r
+import java.util.ArrayList;\r
+import java.util.LinkedHashSet;\r
+import java.util.List;\r
+import java.util.Map;\r
+import java.util.Set;\r
+import java.util.TreeMap;\r
+import java.util.concurrent.atomic.AtomicInteger;\r
+import java.util.regex.Pattern;\r
+\r
+import com.hughes.android.dictionary.Index.Node;\r
+import com.hughes.util.FileUtil;\r
+\r
+public class IndexBuilder {\r
+\r
+  static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
+  static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");\r
+\r
+  public static void main(String[] args) throws IOException,\r
+      ClassNotFoundException {\r
+    if (args.length != 1) {\r
+      System.err.println("No input file.");\r
+      return;\r
+    }\r
+\r
+    final String file = args[0];\r
+    final byte lang = Entry.LANG1;\r
+    Node rootBuilder;\r
+//    rootBuilder = createIndex(file, lang);\r
+//    FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));\r
+    rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));\r
+    \r
+    final AtomicInteger c = new AtomicInteger();\r
+    rootBuilder.forEachNode(new Function<Node>() {\r
+      @Override\r
+      public void invoke(Node t) {\r
+        if (t.offsetsList.size() > 200) {\r
+          System.out.println(t);\r
+          c.incrementAndGet();\r
+        }\r
+      }});\r
+    System.out.println(c);\r
+    \r
+    rootBuilder.recursiveSetDescendantOffsetCount();\r
+    rootBuilder.packDescendants(128);\r
+\r
+    final DataOutputStream os = new DataOutputStream(new FileOutputStream(\r
+        String.format("%s_index_%d", file, lang)));\r
+    final Index.Node root = rootBuilder.toIndexNode();\r
+    root.write(os);\r
+    os.close();\r
+    \r
+    FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang));\r
+    \r
+    Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang));\r
+\r
+\r
+  }\r
+\r
+  // ----------------------------------------------------------------\r
+\r
+  static final class Node implements Serializable {\r
+    private static final long serialVersionUID = -5423134653901704956L;\r
+    \r
+    final TreeMap<String, Node> childrenMap = new TreeMap<String, Node>();\r
+    final List<Integer> offsetsList = new ArrayList<Integer>();\r
+    final String sequence;\r
+    int descendantOffsetCount = 0;\r
+\r
+    public Node(String sequence) {\r
+      if (sequence.length() == 0) {\r
+        System.out.println("Created root.");\r
+      }\r
+      this.sequence = sequence.intern();\r
+    }\r
+\r
+    public Node getIndexNode(final String word, final int pos,\r
+        final boolean create) {\r
+      assert this.sequence.equals(word.substring(0, pos));\r
+\r
+      if (pos == word.length()) {\r
+        assert sequence.equals(word);\r
+        return this;\r
+      }\r
+\r
+      final String rest = word.substring(pos);\r
+      assert rest.length() > 0;\r
+\r
+      final Map.Entry<String, Node> lcsEntry;\r
+      final String lcs;\r
+      {\r
+        final Map.Entry<String, Node> floorEntry = childrenMap.floorEntry(rest);\r
+        final Map.Entry<String, Node> ceilingEntry = childrenMap\r
+            .ceilingEntry(rest);\r
+        final String floorLcs = floorEntry == null ? "" : StringUtil\r
+            .longestCommonSubstring(rest, floorEntry.getKey());\r
+        final String ceilingLcs = ceilingEntry == null ? "" : StringUtil\r
+            .longestCommonSubstring(rest, ceilingEntry.getKey());\r
+        if (floorLcs.length() > ceilingLcs.length()) {\r
+          lcsEntry = floorEntry;\r
+          lcs = floorLcs;\r
+        } else {\r
+          lcsEntry = ceilingEntry;\r
+          lcs = ceilingLcs;\r
+        }\r
+      }\r
+\r
+      // No LCS, have to add everything.\r
+      if (lcs.length() == 0) {\r
+        if (!create) {\r
+          return null;\r
+        }\r
+        final Node result = new Node(word);\r
+        final Object old = childrenMap.put(rest.intern(), result);\r
+        assert old == null;\r
+        // System.out.println("  Adding final chunk: " + rest);\r
+        return result;\r
+      }\r
+\r
+      assert lcsEntry != null;\r
+\r
+      // The map already contained the LCS.\r
+      if (lcs.length() == lcsEntry.getKey().length()) {\r
+        assert lcs.equals(lcsEntry.getKey());\r
+        final Node result = lcsEntry.getValue().getIndexNode(word,\r
+            pos + lcs.length(), create);\r
+        assert result.sequence.equals(word);\r
+        return result;\r
+      }\r
+\r
+      if (!create) {\r
+        return null;\r
+      }\r
+\r
+      // Have to split, inserting the LCS.\r
+      // System.out.println("  Splitting " + lcsEntry + "/" + word + " @ " +\r
+      // lcs);\r
+      final Node newChild = new Node(word.substring(0, pos + lcs.length()));\r
+      final Object old = childrenMap.put(lcs.intern(), newChild);\r
+      assert old == null;\r
+      childrenMap.remove(lcsEntry.getKey());\r
+      newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length())\r
+          .intern(), lcsEntry.getValue());\r
+\r
+      if (lcs.equals(rest)) {\r
+        return newChild;\r
+      }\r
+      final Node result = new Node(word);\r
+      final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length())\r
+          .intern(), result);\r
+      assert old2 == null;\r
+      // System.out.println("  newChildrenMap=" + newChild.childrenMap);\r
+\r
+      return result;\r
+    }\r
+\r
+    Index.Node toIndexNode() {\r
+      final Index.Node result = new Index.Node(childrenMap.size(), offsetsList\r
+          .size());\r
+      int i = 0;\r
+      for (final Map.Entry<String, Node> entry : childrenMap.entrySet()) {\r
+        result.chars[i] = entry.getKey();\r
+        result.children[i] = entry.getValue().toIndexNode();\r
+        i++;\r
+      }\r
+      return result;\r
+    }\r
+\r
+    void forEachNode(final Function<Node> f) {\r
+      f.invoke(this);\r
+      for (final Node child : childrenMap.values()) {\r
+        child.forEachNode(f);\r
+      }\r
+    }\r
+\r
+    int descendantCount() {\r
+      int count = 1;\r
+      for (final Node child : childrenMap.values()) {\r
+        count += child.descendantCount();\r
+      }\r
+      return count;\r
+    }\r
+\r
+    void recursiveSetDescendantOffsetCount() {\r
+      descendantOffsetCount = offsetsList.size();\r
+      for (final Node child : childrenMap.values()) {\r
+        child.recursiveSetDescendantOffsetCount();\r
+        descendantOffsetCount += child.descendantOffsetCount;\r
+      }\r
+    }\r
+\r
+    public void packDescendants(final int maxDescendants) {\r
+      if (descendantOffsetCount <= maxDescendants) {\r
+        final Set<Integer> descendantOffsets = new LinkedHashSet<Integer>();\r
+        recursiveAddDescendants(descendantOffsets);\r
+        assert descendantOffsets.size() <= maxDescendants;\r
+        offsetsList.clear();\r
+        offsetsList.addAll(descendantOffsets);\r
+        childrenMap.clear();\r
+      } else {\r
+        for (final Node child : childrenMap.values()) {\r
+          child.packDescendants(maxDescendants);\r
+        }\r
+      }\r
+    }\r
+\r
+    private void recursiveAddDescendants(final Set<Integer> descendantOffsets) {\r
+      descendantOffsets.addAll(this.offsetsList);\r
+      for (final Node child : childrenMap.values()) {\r
+        child.recursiveAddDescendants(descendantOffsets);\r
+      }\r
+    }\r
+\r
+\r
+    @Override\r
+    public String toString() {\r
+      return sequence + ":" + offsetsList.size();\r
+    }\r
+\r
+  }\r
+\r
+  // ----------------------------------------------------------------\r
+\r
+  static Node createIndex(final String file, final byte lang) throws IOException {\r
+    final Node root = new Node("");\r
+    final RandomAccessFile raf = new RandomAccessFile(file, "r");\r
+    String line;\r
+    final Entry entry = new Entry();\r
+    int lineCount = 0;\r
+    while ((line = raf.readLine()) != null) {\r
+      final long fileLocation = raf.getFilePointer();\r
+      assert ((int) fileLocation) == fileLocation;\r
+\r
+      line = line.trim();\r
+      if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {\r
+        continue;\r
+      }\r
+      final String text = entry.getIndexableText(Entry.LANG1);\r
+      final String[] tokens = WHITESPACE.split(text);\r
+      final Set<String> tokenSet = new LinkedHashSet<String>();\r
+      for (String token : tokens) {\r
+        if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {\r
+          continue;\r
+        }\r
+        tokenSet.add(entry.normalizeToken(token, lang));\r
+      }\r
+      for (final String normalized : tokenSet) {\r
+        // System.out.println("Inserting: " + normalized);\r
+        if ("die".equals(normalized) || "eine".equals(normalized)) {\r
+          // System.out.println("hello");\r
+        }\r
+        final Node node = root.getIndexNode(normalized, 0, true);\r
+        node.offsetsList.add((int) fileLocation);\r
+        assert node == root.getIndexNode(normalized, 0, false);\r
+        assert normalized\r
+            .equals(root.getIndexNode(normalized, 0, false).sequence);\r
+      }\r
+\r
+      if (lineCount % 10000 == 0) {\r
+        System.out.println("IndexBuilder: " + "lineCount=" + lineCount);\r
+      }\r
+      lineCount++;\r
+    }\r
+    raf.close();\r
+    return root;\r
+  }\r
+\r
+}\r
diff --git a/src/com/hughes/android/dictionary/StringUtil.java b/src/com/hughes/android/dictionary/StringUtil.java
new file mode 100755 (executable)
index 0000000..12e44ea
--- /dev/null
@@ -0,0 +1,14 @@
+package com.hughes.android.dictionary;\r
+\r
+public final class StringUtil {\r
+\r
+  public static String longestCommonSubstring(final String s1, final String s2) {\r
+    for (int i = 0; i < s1.length() && i < s2.length(); i++) {\r
+      if (s1.charAt(i) != s2.charAt(i)) {\r
+        return s1.substring(0, i);\r
+      }\r
+    }\r
+    return s1.length() < s2.length() ? s1 : s2;\r
+  }\r
+\r
+}\r
diff --git a/stoplist-de.txt b/stoplist-de.txt
new file mode 100755 (executable)
index 0000000..e69de29