X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2FIndexBuilder.java;h=49d393df1c76f85d68b76dec2086d767eaecc3bc;hb=7ce755ee7faeba246f8e8078556f83b2b4dac108;hp=13dc1dbd59e8fd110bcf4e08409e75e98826f360;hpb=02e472d515152911bd15b8a3b88923854322941b;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/IndexBuilder.java b/src/com/hughes/android/dictionary/IndexBuilder.java index 13dc1db..49d393d 100755 --- a/src/com/hughes/android/dictionary/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/IndexBuilder.java @@ -1,14 +1,14 @@ package com.hughes.android.dictionary; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; -import java.util.LinkedHashSet; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.TreeMap; import java.util.regex.Pattern; @@ -25,13 +25,18 @@ public class IndexBuilder { System.err.println("No input file."); return; } + final String dictionaryFileName = args[0]; + createIndex(dictionaryFileName, Entry.LANG1); + createIndex(dictionaryFileName, Entry.LANG2); + } - final String file = args[0]; - final byte lang = Entry.LANG1; + private static void createIndex(final String dictionaryFileName, + final byte lang) throws IOException, FileNotFoundException, + ClassNotFoundException { Node rootBuilder; - rootBuilder = createIndex(file, lang); - FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang)); - rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang)); + rootBuilder = processDictionaryLines(dictionaryFileName, lang); + FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang)); + rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang)); rootBuilder.forEachNode(new Function() { @Override @@ -43,11 +48,10 @@ public class IndexBuilder { // Dump twice to get accurate file locations. for (int i = 0; i < 2; ++i) { - final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw"); + final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw"); rootBuilder.dump(raf); raf.close(); } - } // ---------------------------------------------------------------- @@ -74,173 +78,17 @@ public class IndexBuilder { } } - static final class Node implements Serializable { - final String normalizedWord; - - final TreeMap children = new TreeMap(); - final TreeMap> entryDescriptorsMap = new TreeMap>(); - -// final List offsets = new ArrayList(); - - int descendantOffsetCount = 0; - - int indexFileLocation = -1; - - public Node(final String normalizedWord) { - if (normalizedWord.length() == 0) { - System.out.println("Created root."); - } - this.normalizedWord = normalizedWord.intern(); - } - - public Node getNode(final String nWord, final int pos, - final boolean create) { - assert this.normalizedWord.equals(nWord.substring(0, pos)); - - if (pos == nWord.length()) { - assert normalizedWord.equals(nWord); - return this; - } - - final String rest = nWord.substring(pos); - assert rest.length() > 0; - - final Map.Entry lcsEntry; - final String lcs; - { - final Map.Entry floorEntry = children.floorEntry(rest); - final Map.Entry ceilingEntry = children - .ceilingEntry(rest); - final String floorLcs = floorEntry == null ? "" : StringUtil - .longestCommonSubstring(rest, floorEntry.getKey()); - final String ceilingLcs = ceilingEntry == null ? "" : StringUtil - .longestCommonSubstring(rest, ceilingEntry.getKey()); - if (floorLcs.length() > ceilingLcs.length()) { - lcsEntry = floorEntry; - lcs = floorLcs; - } else { - lcsEntry = ceilingEntry; - lcs = ceilingLcs; - } - } - - // No LCS, have to add everything. - if (lcs.length() == 0) { - if (!create) { - return null; - } - final Node result = new Node(nWord); - final Object old = children.put(rest.intern(), result); - assert old == null; - // System.out.println(" Adding final chunk: " + rest); - return result; - } - - assert lcsEntry != null; - - // The map already contained the LCS. - if (lcs.length() == lcsEntry.getKey().length()) { - assert lcs.equals(lcsEntry.getKey()); - final Node result = lcsEntry.getValue().getNode(nWord, - pos + lcs.length(), create); - assert result.normalizedWord.equals(nWord); - return result; - } - - if (!create) { - return null; - } - - // Have to split, inserting the LCS. - // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " + - // lcs); - final Node newChild = new Node(nWord.substring(0, pos + lcs.length())); - final Object old = children.put(lcs.intern(), newChild); - assert old == null; - children.remove(lcsEntry.getKey()); - newChild.children.put(lcsEntry.getKey().substring(lcs.length()) - .intern(), lcsEntry.getValue()); - - if (lcs.equals(rest)) { - return newChild; - } - final Node result = new Node(nWord); - final Object old2 = newChild.children.put(rest.substring(lcs.length()) - .intern(), result); - assert old2 == null; - // System.out.println(" newchildren=" + newChild.children); - - return result; - } - - void forEachNode(final Function f) { - f.invoke(this); - for (final Node child : children.values()) { - child.forEachNode(f); - } - } - - int descendantCount() { - int count = 1; - for (final Node child : children.values()) { - count += child.descendantCount(); - } - return count; - } - - void recursiveSetDescendantOffsetCount() { - descendantOffsetCount = offsets.size(); - for (final Node child : children.values()) { - child.recursiveSetDescendantOffsetCount(); - descendantOffsetCount += child.descendantOffsetCount; - } - } - - @Override - public String toString() { - return normalizedWord; - } - - void dump(final RandomAccessFile file) throws IOException { - if (indexFileLocation == -1) { - indexFileLocation = (int) file.getFilePointer(); - } else { - assert indexFileLocation == file.getFilePointer(); - } - - // Children. - file.writeInt(children.size()); - for (final Map.Entry child : children.entrySet()) { - file.writeUTF(child.getKey()); - file.writeInt(child.getValue().indexFileLocation); - } - - // Entries. - file.writeInt(entryDescriptorsMap.size()); - for (final Map.Entry> entry : entryDescriptorsMap.entrySet()) { - file.writeUTF(entry.getKey()); - file.writeInt(entry.getValue().size()); - - file.writeInt(offsets.get(i).offset); - } - - // Dump children. - for (final Map.Entry child : children.entrySet()) { - child.getValue().dump(file); - } - } - } // ---------------------------------------------------------------- - static Node createIndex(final String file, final byte lang) throws IOException { + static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException { final Node root = new Node(""); - final RandomAccessFile raf = new RandomAccessFile(file, "r"); + final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r"); String line; final Entry entry = new Entry(); int lineCount = 0; long fileLocation = 0; - while ((line = raf.readLine()) != null) { + while ((line = dictionaryFile.readLine()) != null) { assert ((int) fileLocation) == fileLocation; line = line.trim(); @@ -249,23 +97,20 @@ public class IndexBuilder { } final String text = entry.getIndexableText(Entry.LANG1); final String[] tokens = WHITESPACE.split(text); - final Set tokenSet = new LinkedHashSet(); + final Map tokenToNormalizedMap = new LinkedHashMap(); for (String token : tokens) { if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) { continue; } - tokenSet.add(EntryFactory.entryFactory.normalizeToken(token, lang)); + tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token)); } - for (final String normalized : tokenSet) { - // System.out.println("Inserting: " + normalized); - if ("die".equals(normalized) || "eine".equals(normalized)) { - // System.out.println("hello"); - } - final Node node = root.getNode(normalized, 0, true); - node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length)); - assert node == root.getNode(normalized, 0, false); - assert normalized - .equals(root.getNode(normalized, 0, false).normalizedWord); + for (final Map.Entry tokenToNormalized : tokenToNormalizedMap.entrySet()) { + final String normalizedToken = tokenToNormalized.getValue(); + final Node node = root.getNode(normalizedToken, 0, true); + node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length)); + assert node == root.getNode(normalizedToken, 0, false); + assert normalizedToken + .equals(root.getNode(normalizedToken, 0, false).normalizedToken); } if (lineCount % 10000 == 0) { @@ -273,9 +118,12 @@ public class IndexBuilder { } lineCount++; - fileLocation = raf.getFilePointer(); + fileLocation = dictionaryFile.getFilePointer(); } - raf.close(); + dictionaryFile.close(); + + root.recursiveSetDescendantCounts(); + return root; }