From: thadh Date: Wed, 28 Jan 2009 19:34:20 +0000 (-0800) Subject: go X-Git-Url: http://gitweb.fperrin.net/?a=commitdiff_plain;h=cc2944df314a33299add12dd7390beea2a709e0a;p=DictionaryPC.git go --- cc2944df314a33299add12dd7390beea2a709e0a diff --git a/src/com/hughes/android/dictionary/Function.java b/src/com/hughes/android/dictionary/Function.java new file mode 100755 index 0000000..72d9c02 --- /dev/null +++ b/src/com/hughes/android/dictionary/Function.java @@ -0,0 +1,7 @@ +package com.hughes.android.dictionary; + +public interface Function { + + void invoke(T t); + +} diff --git a/src/com/hughes/android/dictionary/IndexBuilder.java b/src/com/hughes/android/dictionary/IndexBuilder.java new file mode 100755 index 0000000..9f055ca --- /dev/null +++ b/src/com/hughes/android/dictionary/IndexBuilder.java @@ -0,0 +1,275 @@ +package com.hughes.android.dictionary; + +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Pattern; + +import com.hughes.android.dictionary.Index.Node; +import com.hughes.util.FileUtil; + +public class IndexBuilder { + + static final Pattern WHITESPACE = Pattern.compile("\\s+"); + static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+"); + + public static void main(String[] args) throws IOException, + ClassNotFoundException { + if (args.length != 1) { + System.err.println("No input file."); + return; + } + + final String file = args[0]; + final byte lang = Entry.LANG1; + Node rootBuilder; +// rootBuilder = createIndex(file, lang); +// FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang)); + rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang)); + + final AtomicInteger c = new AtomicInteger(); + rootBuilder.forEachNode(new Function() { + @Override + public void invoke(Node t) { + if (t.offsetsList.size() > 200) { + System.out.println(t); + c.incrementAndGet(); + } + }}); + System.out.println(c); + + rootBuilder.recursiveSetDescendantOffsetCount(); + rootBuilder.packDescendants(128); + + final DataOutputStream os = new DataOutputStream(new FileOutputStream( + String.format("%s_index_%d", file, lang))); + final Index.Node root = rootBuilder.toIndexNode(); + root.write(os); + os.close(); + + FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang)); + + Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang)); + + + } + + // ---------------------------------------------------------------- + + static final class Node implements Serializable { + private static final long serialVersionUID = -5423134653901704956L; + + final TreeMap childrenMap = new TreeMap(); + final List offsetsList = new ArrayList(); + final String sequence; + int descendantOffsetCount = 0; + + public Node(String sequence) { + if (sequence.length() == 0) { + System.out.println("Created root."); + } + this.sequence = sequence.intern(); + } + + public Node getIndexNode(final String word, final int pos, + final boolean create) { + assert this.sequence.equals(word.substring(0, pos)); + + if (pos == word.length()) { + assert sequence.equals(word); + return this; + } + + final String rest = word.substring(pos); + assert rest.length() > 0; + + final Map.Entry lcsEntry; + final String lcs; + { + final Map.Entry floorEntry = childrenMap.floorEntry(rest); + final Map.Entry ceilingEntry = childrenMap + .ceilingEntry(rest); + final String floorLcs = floorEntry == null ? "" : StringUtil + .longestCommonSubstring(rest, floorEntry.getKey()); + final String ceilingLcs = ceilingEntry == null ? "" : StringUtil + .longestCommonSubstring(rest, ceilingEntry.getKey()); + if (floorLcs.length() > ceilingLcs.length()) { + lcsEntry = floorEntry; + lcs = floorLcs; + } else { + lcsEntry = ceilingEntry; + lcs = ceilingLcs; + } + } + + // No LCS, have to add everything. + if (lcs.length() == 0) { + if (!create) { + return null; + } + final Node result = new Node(word); + final Object old = childrenMap.put(rest.intern(), result); + assert old == null; + // System.out.println(" Adding final chunk: " + rest); + return result; + } + + assert lcsEntry != null; + + // The map already contained the LCS. + if (lcs.length() == lcsEntry.getKey().length()) { + assert lcs.equals(lcsEntry.getKey()); + final Node result = lcsEntry.getValue().getIndexNode(word, + pos + lcs.length(), create); + assert result.sequence.equals(word); + return result; + } + + if (!create) { + return null; + } + + // Have to split, inserting the LCS. + // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " + + // lcs); + final Node newChild = new Node(word.substring(0, pos + lcs.length())); + final Object old = childrenMap.put(lcs.intern(), newChild); + assert old == null; + childrenMap.remove(lcsEntry.getKey()); + newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length()) + .intern(), lcsEntry.getValue()); + + if (lcs.equals(rest)) { + return newChild; + } + final Node result = new Node(word); + final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length()) + .intern(), result); + assert old2 == null; + // System.out.println(" newChildrenMap=" + newChild.childrenMap); + + return result; + } + + Index.Node toIndexNode() { + final Index.Node result = new Index.Node(childrenMap.size(), offsetsList + .size()); + int i = 0; + for (final Map.Entry entry : childrenMap.entrySet()) { + result.chars[i] = entry.getKey(); + result.children[i] = entry.getValue().toIndexNode(); + i++; + } + return result; + } + + void forEachNode(final Function f) { + f.invoke(this); + for (final Node child : childrenMap.values()) { + child.forEachNode(f); + } + } + + int descendantCount() { + int count = 1; + for (final Node child : childrenMap.values()) { + count += child.descendantCount(); + } + return count; + } + + void recursiveSetDescendantOffsetCount() { + descendantOffsetCount = offsetsList.size(); + for (final Node child : childrenMap.values()) { + child.recursiveSetDescendantOffsetCount(); + descendantOffsetCount += child.descendantOffsetCount; + } + } + + public void packDescendants(final int maxDescendants) { + if (descendantOffsetCount <= maxDescendants) { + final Set descendantOffsets = new LinkedHashSet(); + recursiveAddDescendants(descendantOffsets); + assert descendantOffsets.size() <= maxDescendants; + offsetsList.clear(); + offsetsList.addAll(descendantOffsets); + childrenMap.clear(); + } else { + for (final Node child : childrenMap.values()) { + child.packDescendants(maxDescendants); + } + } + } + + private void recursiveAddDescendants(final Set descendantOffsets) { + descendantOffsets.addAll(this.offsetsList); + for (final Node child : childrenMap.values()) { + child.recursiveAddDescendants(descendantOffsets); + } + } + + + @Override + public String toString() { + return sequence + ":" + offsetsList.size(); + } + + } + + // ---------------------------------------------------------------- + + static Node createIndex(final String file, final byte lang) throws IOException { + final Node root = new Node(""); + final RandomAccessFile raf = new RandomAccessFile(file, "r"); + String line; + final Entry entry = new Entry(); + int lineCount = 0; + while ((line = raf.readLine()) != null) { + final long fileLocation = raf.getFilePointer(); + assert ((int) fileLocation) == fileLocation; + + line = line.trim(); + if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) { + continue; + } + final String text = entry.getIndexableText(Entry.LANG1); + final String[] tokens = WHITESPACE.split(text); + final Set tokenSet = new LinkedHashSet(); + for (String token : tokens) { + if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) { + continue; + } + tokenSet.add(entry.normalizeToken(token, lang)); + } + for (final String normalized : tokenSet) { + // System.out.println("Inserting: " + normalized); + if ("die".equals(normalized) || "eine".equals(normalized)) { + // System.out.println("hello"); + } + final Node node = root.getIndexNode(normalized, 0, true); + node.offsetsList.add((int) fileLocation); + assert node == root.getIndexNode(normalized, 0, false); + assert normalized + .equals(root.getIndexNode(normalized, 0, false).sequence); + } + + if (lineCount % 10000 == 0) { + System.out.println("IndexBuilder: " + "lineCount=" + lineCount); + } + lineCount++; + } + raf.close(); + return root; + } + +} diff --git a/src/com/hughes/android/dictionary/StringUtil.java b/src/com/hughes/android/dictionary/StringUtil.java new file mode 100755 index 0000000..12e44ea --- /dev/null +++ b/src/com/hughes/android/dictionary/StringUtil.java @@ -0,0 +1,14 @@ +package com.hughes.android.dictionary; + +public final class StringUtil { + + public static String longestCommonSubstring(final String s1, final String s2) { + for (int i = 0; i < s1.length() && i < s2.length(); i++) { + if (s1.charAt(i) != s2.charAt(i)) { + return s1.substring(0, i); + } + } + return s1.length() < s2.length() ? s1 : s2; + } + +} diff --git a/stoplist-de.txt b/stoplist-de.txt new file mode 100755 index 0000000..e69de29