From: thadh Date: Tue, 10 Feb 2009 07:28:50 +0000 (-0800) Subject: go X-Git-Url: http://gitweb.fperrin.net/?a=commitdiff_plain;h=ea94c461120799b546263a9006125d450b2b3c7b;p=DictionaryPC.git go --- diff --git a/src/com/hughes/android/dictionary/IndexBuilder.java b/src/com/hughes/android/dictionary/IndexBuilder.java index 9f055ca..e92d136 100755 --- a/src/com/hughes/android/dictionary/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/IndexBuilder.java @@ -1,21 +1,17 @@ package com.hughes.android.dictionary; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.RandomAccessFile; import java.io.Serializable; import java.util.ArrayList; +import java.util.Collections; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; -import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; -import com.hughes.android.dictionary.Index.Node; import com.hughes.util.FileUtil; public class IndexBuilder { @@ -33,46 +29,68 @@ public class IndexBuilder { final String file = args[0]; final byte lang = Entry.LANG1; Node rootBuilder; -// rootBuilder = createIndex(file, lang); -// FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang)); + rootBuilder = createIndex(file, lang); + FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang)); rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang)); - - final AtomicInteger c = new AtomicInteger(); + +// final AtomicInteger c = new AtomicInteger(); rootBuilder.forEachNode(new Function() { @Override public void invoke(Node t) { - if (t.offsetsList.size() > 200) { - System.out.println(t); - c.incrementAndGet(); - } + Collections.sort(t.offsets); +// if (t.offsets.size() > 128) { +// System.out.println(t); +// c.incrementAndGet(); +// } }}); - System.out.println(c); - - rootBuilder.recursiveSetDescendantOffsetCount(); - rootBuilder.packDescendants(128); - - final DataOutputStream os = new DataOutputStream(new FileOutputStream( - String.format("%s_index_%d", file, lang))); - final Index.Node root = rootBuilder.toIndexNode(); - root.write(os); - os.close(); +// System.out.println(c); - FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang)); +// rootBuilder.recursiveSetDescendantOffsetCount(); +// rootBuilder.packDescendants(128); + + // Dump twice to get accurate file locations. + for (int i = 0; i < 2; ++i) { + final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw"); + rootBuilder.dump(raf); + raf.close(); + } - Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang)); - - } // ---------------------------------------------------------------- + + static final class EntryDescriptor implements Comparable, Serializable { + final int offset; + final int numTokens; + public EntryDescriptor(int offset, int numTokens) { + this.offset = offset; + this.numTokens = numTokens; + } + @Override + public boolean equals(Object obj) { + final EntryDescriptor that = (EntryDescriptor) obj; + return this.offset == that.offset; + } + @Override + public int hashCode() { + return offset; + } + @Override + public int compareTo(EntryDescriptor o) { + return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1; + } + } static final class Node implements Serializable { private static final long serialVersionUID = -5423134653901704956L; - final TreeMap childrenMap = new TreeMap(); - final List offsetsList = new ArrayList(); + final TreeMap children = new TreeMap(); + final List offsets = new ArrayList(); final String sequence; + int descendantOffsetCount = 0; + + int indexFileLocation = -1; public Node(String sequence) { if (sequence.length() == 0) { @@ -96,8 +114,8 @@ public class IndexBuilder { final Map.Entry lcsEntry; final String lcs; { - final Map.Entry floorEntry = childrenMap.floorEntry(rest); - final Map.Entry ceilingEntry = childrenMap + final Map.Entry floorEntry = children.floorEntry(rest); + final Map.Entry ceilingEntry = children .ceilingEntry(rest); final String floorLcs = floorEntry == null ? "" : StringUtil .longestCommonSubstring(rest, floorEntry.getKey()); @@ -118,7 +136,7 @@ public class IndexBuilder { return null; } final Node result = new Node(word); - final Object old = childrenMap.put(rest.intern(), result); + final Object old = children.put(rest.intern(), result); assert old == null; // System.out.println(" Adding final chunk: " + rest); return result; @@ -143,29 +161,29 @@ public class IndexBuilder { // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " + // lcs); final Node newChild = new Node(word.substring(0, pos + lcs.length())); - final Object old = childrenMap.put(lcs.intern(), newChild); + final Object old = children.put(lcs.intern(), newChild); assert old == null; - childrenMap.remove(lcsEntry.getKey()); - newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length()) + children.remove(lcsEntry.getKey()); + newChild.children.put(lcsEntry.getKey().substring(lcs.length()) .intern(), lcsEntry.getValue()); if (lcs.equals(rest)) { return newChild; } final Node result = new Node(word); - final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length()) + final Object old2 = newChild.children.put(rest.substring(lcs.length()) .intern(), result); assert old2 == null; - // System.out.println(" newChildrenMap=" + newChild.childrenMap); + // System.out.println(" newchildren=" + newChild.children); return result; } - Index.Node toIndexNode() { - final Index.Node result = new Index.Node(childrenMap.size(), offsetsList + MemoryIndex.Node toIndexNode() { + final MemoryIndex.Node result = new MemoryIndex.Node(children.size(), offsets .size()); int i = 0; - for (final Map.Entry entry : childrenMap.entrySet()) { + for (final Map.Entry entry : children.entrySet()) { result.chars[i] = entry.getKey(); result.children[i] = entry.getValue().toIndexNode(); i++; @@ -175,22 +193,22 @@ public class IndexBuilder { void forEachNode(final Function f) { f.invoke(this); - for (final Node child : childrenMap.values()) { + for (final Node child : children.values()) { child.forEachNode(f); } } int descendantCount() { int count = 1; - for (final Node child : childrenMap.values()) { + for (final Node child : children.values()) { count += child.descendantCount(); } return count; } void recursiveSetDescendantOffsetCount() { - descendantOffsetCount = offsetsList.size(); - for (final Node child : childrenMap.values()) { + descendantOffsetCount = offsets.size(); + for (final Node child : children.values()) { child.recursiveSetDescendantOffsetCount(); descendantOffsetCount += child.descendantOffsetCount; } @@ -198,32 +216,56 @@ public class IndexBuilder { public void packDescendants(final int maxDescendants) { if (descendantOffsetCount <= maxDescendants) { - final Set descendantOffsets = new LinkedHashSet(); + final Set descendantOffsets = new LinkedHashSet(); recursiveAddDescendants(descendantOffsets); assert descendantOffsets.size() <= maxDescendants; - offsetsList.clear(); - offsetsList.addAll(descendantOffsets); - childrenMap.clear(); + offsets.clear(); + offsets.addAll(descendantOffsets); + children.clear(); } else { - for (final Node child : childrenMap.values()) { + for (final Node child : children.values()) { child.packDescendants(maxDescendants); } } } - private void recursiveAddDescendants(final Set descendantOffsets) { - descendantOffsets.addAll(this.offsetsList); - for (final Node child : childrenMap.values()) { + private void recursiveAddDescendants(final Set descendantOffsets) { + descendantOffsets.addAll(this.offsets); + for (final Node child : children.values()) { child.recursiveAddDescendants(descendantOffsets); } } - @Override public String toString() { - return sequence + ":" + offsetsList.size(); + return sequence + ":" + offsets.size(); + } + + void dump(final RandomAccessFile file) throws IOException { + if (indexFileLocation == -1) { + indexFileLocation = (int) file.getFilePointer(); + } else { + assert indexFileLocation == file.getFilePointer(); + } + + // Children. + file.writeInt(children.size()); + for (final Map.Entry child : children.entrySet()) { + file.writeUTF(child.getKey()); + file.writeInt(child.getValue().indexFileLocation); + } + + // Offsets. + file.writeInt(offsets.size()); + for (int i = 0; i < offsets.size(); i++) { + file.writeInt(offsets.get(i).offset); + } + + // Dump children. + for (final Map.Entry child : children.entrySet()) { + child.getValue().dump(file); + } } - } // ---------------------------------------------------------------- @@ -234,8 +276,8 @@ public class IndexBuilder { String line; final Entry entry = new Entry(); int lineCount = 0; + long fileLocation = 0; while ((line = raf.readLine()) != null) { - final long fileLocation = raf.getFilePointer(); assert ((int) fileLocation) == fileLocation; line = line.trim(); @@ -257,7 +299,7 @@ public class IndexBuilder { // System.out.println("hello"); } final Node node = root.getIndexNode(normalized, 0, true); - node.offsetsList.add((int) fileLocation); + node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length)); assert node == root.getIndexNode(normalized, 0, false); assert normalized .equals(root.getIndexNode(normalized, 0, false).sequence); @@ -266,7 +308,9 @@ public class IndexBuilder { if (lineCount % 10000 == 0) { System.out.println("IndexBuilder: " + "lineCount=" + lineCount); } + lineCount++; + fileLocation = raf.getFilePointer(); } raf.close(); return root; diff --git a/src/com/hughes/android/dictionary/IndexTest.java b/src/com/hughes/android/dictionary/IndexTest.java new file mode 100755 index 0000000..5c0b847 --- /dev/null +++ b/src/com/hughes/android/dictionary/IndexTest.java @@ -0,0 +1,49 @@ +package com.hughes.android.dictionary; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.LinkedHashSet; +import java.util.Set; + +import junit.framework.TestCase; + +import com.hughes.android.dictionary.Index.Node; +import com.hughes.util.FileUtil; + +public class IndexTest extends TestCase { + + static final String file = "c:\\dict-de-en.txt"; + static final String file_index = file + "_index_0"; + + public void testLookup() throws IOException { + System.out.println("testLookup"); + final Index index = new Index(file_index); + final Node node = index.lookup("handhubwagen"); + assertNotNull(node); + + final RandomAccessFile raf = new RandomAccessFile(file, "r"); + for (int i = 0; i < node.offsets.length; ++i) { + final String entry = FileUtil.readLine(raf, node.offsets[i]); + System.out.println(entry); + assertTrue(entry.toLowerCase().contains("handhubwagen")); + } + } + + public void testGetDescendantOffsets() throws IOException { + System.out.println("testGetDescendantOffsets"); + final Index index = new Index(file_index); + + final Node node = index.lookup("handhebe"); + assertNotNull(node); + assertEquals("handhebel", node.text); + final Set offsets = new LinkedHashSet(); + node.getDescendantEntryOffsets(offsets, 10); + final RandomAccessFile raf = new RandomAccessFile(file, "r"); + for (final Integer offset : offsets) { + final String entry = FileUtil.readLine(raf, offset); + System.out.println(entry); + assertTrue(entry.toLowerCase().contains(node.text)); + } + } + +}