X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2FIndexBuilder.java;h=49d393df1c76f85d68b76dec2086d767eaecc3bc;hb=7ce755ee7faeba246f8e8078556f83b2b4dac108;hp=9f055cad499cb7297d433157f6a79598643de641;hpb=cc2944df314a33299add12dd7390beea2a709e0a;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/IndexBuilder.java b/src/com/hughes/android/dictionary/IndexBuilder.java index 9f055ca..49d393d 100755 --- a/src/com/hughes/android/dictionary/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/IndexBuilder.java @@ -1,21 +1,17 @@ package com.hughes.android.dictionary; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileOutputStream; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; import java.io.Serializable; import java.util.ArrayList; -import java.util.LinkedHashSet; +import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.TreeMap; -import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; -import com.hughes.android.dictionary.Index.Node; import com.hughes.util.FileUtil; public class IndexBuilder { @@ -29,213 +25,70 @@ public class IndexBuilder { System.err.println("No input file."); return; } + final String dictionaryFileName = args[0]; + createIndex(dictionaryFileName, Entry.LANG1); + createIndex(dictionaryFileName, Entry.LANG2); + } - final String file = args[0]; - final byte lang = Entry.LANG1; + private static void createIndex(final String dictionaryFileName, + final byte lang) throws IOException, FileNotFoundException, + ClassNotFoundException { Node rootBuilder; -// rootBuilder = createIndex(file, lang); -// FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang)); - rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang)); - - final AtomicInteger c = new AtomicInteger(); + rootBuilder = processDictionaryLines(dictionaryFileName, lang); + FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang)); + rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang)); + rootBuilder.forEachNode(new Function() { @Override - public void invoke(Node t) { - if (t.offsetsList.size() > 200) { - System.out.println(t); - c.incrementAndGet(); + public void invoke(final Node node) { + for (final List entryDescriptors : node.entryDescriptorsMap.values()) { + Collections.sort(entryDescriptors); } }}); - System.out.println(c); - rootBuilder.recursiveSetDescendantOffsetCount(); - rootBuilder.packDescendants(128); - - final DataOutputStream os = new DataOutputStream(new FileOutputStream( - String.format("%s_index_%d", file, lang))); - final Index.Node root = rootBuilder.toIndexNode(); - root.write(os); - os.close(); - - FileUtil.write(root, String.format("%s_index_%d.serialized", file, lang)); - - Object o = FileUtil.read(String.format("%s_index_%d.serialized", file, lang)); - - + // Dump twice to get accurate file locations. + for (int i = 0; i < 2; ++i) { + final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw"); + rootBuilder.dump(raf); + raf.close(); + } } // ---------------------------------------------------------------- - - static final class Node implements Serializable { - private static final long serialVersionUID = -5423134653901704956L; - - final TreeMap childrenMap = new TreeMap(); - final List offsetsList = new ArrayList(); - final String sequence; - int descendantOffsetCount = 0; - - public Node(String sequence) { - if (sequence.length() == 0) { - System.out.println("Created root."); - } - this.sequence = sequence.intern(); - } - - public Node getIndexNode(final String word, final int pos, - final boolean create) { - assert this.sequence.equals(word.substring(0, pos)); - - if (pos == word.length()) { - assert sequence.equals(word); - return this; - } - - final String rest = word.substring(pos); - assert rest.length() > 0; - - final Map.Entry lcsEntry; - final String lcs; - { - final Map.Entry floorEntry = childrenMap.floorEntry(rest); - final Map.Entry ceilingEntry = childrenMap - .ceilingEntry(rest); - final String floorLcs = floorEntry == null ? "" : StringUtil - .longestCommonSubstring(rest, floorEntry.getKey()); - final String ceilingLcs = ceilingEntry == null ? "" : StringUtil - .longestCommonSubstring(rest, ceilingEntry.getKey()); - if (floorLcs.length() > ceilingLcs.length()) { - lcsEntry = floorEntry; - lcs = floorLcs; - } else { - lcsEntry = ceilingEntry; - lcs = ceilingLcs; - } - } - - // No LCS, have to add everything. - if (lcs.length() == 0) { - if (!create) { - return null; - } - final Node result = new Node(word); - final Object old = childrenMap.put(rest.intern(), result); - assert old == null; - // System.out.println(" Adding final chunk: " + rest); - return result; - } - - assert lcsEntry != null; - - // The map already contained the LCS. - if (lcs.length() == lcsEntry.getKey().length()) { - assert lcs.equals(lcsEntry.getKey()); - final Node result = lcsEntry.getValue().getIndexNode(word, - pos + lcs.length(), create); - assert result.sequence.equals(word); - return result; - } - - if (!create) { - return null; - } - - // Have to split, inserting the LCS. - // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " + - // lcs); - final Node newChild = new Node(word.substring(0, pos + lcs.length())); - final Object old = childrenMap.put(lcs.intern(), newChild); - assert old == null; - childrenMap.remove(lcsEntry.getKey()); - newChild.childrenMap.put(lcsEntry.getKey().substring(lcs.length()) - .intern(), lcsEntry.getValue()); - - if (lcs.equals(rest)) { - return newChild; - } - final Node result = new Node(word); - final Object old2 = newChild.childrenMap.put(rest.substring(lcs.length()) - .intern(), result); - assert old2 == null; - // System.out.println(" newChildrenMap=" + newChild.childrenMap); - - return result; - } - - Index.Node toIndexNode() { - final Index.Node result = new Index.Node(childrenMap.size(), offsetsList - .size()); - int i = 0; - for (final Map.Entry entry : childrenMap.entrySet()) { - result.chars[i] = entry.getKey(); - result.children[i] = entry.getValue().toIndexNode(); - i++; - } - return result; - } - - void forEachNode(final Function f) { - f.invoke(this); - for (final Node child : childrenMap.values()) { - child.forEachNode(f); - } - } - - int descendantCount() { - int count = 1; - for (final Node child : childrenMap.values()) { - count += child.descendantCount(); - } - return count; - } - - void recursiveSetDescendantOffsetCount() { - descendantOffsetCount = offsetsList.size(); - for (final Node child : childrenMap.values()) { - child.recursiveSetDescendantOffsetCount(); - descendantOffsetCount += child.descendantOffsetCount; - } + + static final class EntryDescriptor implements Comparable, Serializable { + final int offset; + final int numTokens; + public EntryDescriptor(int offset, int numTokens) { + this.offset = offset; + this.numTokens = numTokens; } - - public void packDescendants(final int maxDescendants) { - if (descendantOffsetCount <= maxDescendants) { - final Set descendantOffsets = new LinkedHashSet(); - recursiveAddDescendants(descendantOffsets); - assert descendantOffsets.size() <= maxDescendants; - offsetsList.clear(); - offsetsList.addAll(descendantOffsets); - childrenMap.clear(); - } else { - for (final Node child : childrenMap.values()) { - child.packDescendants(maxDescendants); - } - } + @Override + public boolean equals(Object obj) { + final EntryDescriptor that = (EntryDescriptor) obj; + return this.offset == that.offset; } - - private void recursiveAddDescendants(final Set descendantOffsets) { - descendantOffsets.addAll(this.offsetsList); - for (final Node child : childrenMap.values()) { - child.recursiveAddDescendants(descendantOffsets); - } + @Override + public int hashCode() { + return offset; } - - @Override - public String toString() { - return sequence + ":" + offsetsList.size(); + public int compareTo(EntryDescriptor o) { + return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1; } - } + // ---------------------------------------------------------------- - static Node createIndex(final String file, final byte lang) throws IOException { + static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException { final Node root = new Node(""); - final RandomAccessFile raf = new RandomAccessFile(file, "r"); + final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r"); String line; final Entry entry = new Entry(); int lineCount = 0; - while ((line = raf.readLine()) != null) { - final long fileLocation = raf.getFilePointer(); + long fileLocation = 0; + while ((line = dictionaryFile.readLine()) != null) { assert ((int) fileLocation) == fileLocation; line = line.trim(); @@ -244,31 +97,33 @@ public class IndexBuilder { } final String text = entry.getIndexableText(Entry.LANG1); final String[] tokens = WHITESPACE.split(text); - final Set tokenSet = new LinkedHashSet(); + final Map tokenToNormalizedMap = new LinkedHashMap(); for (String token : tokens) { if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) { continue; } - tokenSet.add(entry.normalizeToken(token, lang)); + tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token)); } - for (final String normalized : tokenSet) { - // System.out.println("Inserting: " + normalized); - if ("die".equals(normalized) || "eine".equals(normalized)) { - // System.out.println("hello"); - } - final Node node = root.getIndexNode(normalized, 0, true); - node.offsetsList.add((int) fileLocation); - assert node == root.getIndexNode(normalized, 0, false); - assert normalized - .equals(root.getIndexNode(normalized, 0, false).sequence); + for (final Map.Entry tokenToNormalized : tokenToNormalizedMap.entrySet()) { + final String normalizedToken = tokenToNormalized.getValue(); + final Node node = root.getNode(normalizedToken, 0, true); + node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length)); + assert node == root.getNode(normalizedToken, 0, false); + assert normalizedToken + .equals(root.getNode(normalizedToken, 0, false).normalizedToken); } if (lineCount % 10000 == 0) { System.out.println("IndexBuilder: " + "lineCount=" + lineCount); } + lineCount++; + fileLocation = dictionaryFile.getFilePointer(); } - raf.close(); + dictionaryFile.close(); + + root.recursiveSetDescendantCounts(); + return root; }