package com.hughes.android.dictionary;\r
\r
+import java.io.FileNotFoundException;\r
import java.io.IOException;\r
import java.io.RandomAccessFile;\r
import java.io.Serializable;\r
import java.util.ArrayList;\r
import java.util.Collections;\r
-import java.util.LinkedHashSet;\r
+import java.util.LinkedHashMap;\r
import java.util.List;\r
import java.util.Map;\r
-import java.util.Set;\r
import java.util.TreeMap;\r
import java.util.regex.Pattern;\r
\r
System.err.println("No input file.");\r
return;\r
}\r
+ final String dictionaryFileName = args[0];\r
+ createIndex(dictionaryFileName, Entry.LANG1);\r
+ createIndex(dictionaryFileName, Entry.LANG2);\r
+ }\r
\r
- final String file = args[0];\r
- final byte lang = Entry.LANG1;\r
+ private static void createIndex(final String dictionaryFileName,
+ final byte lang) throws IOException, ClassNotFoundException {
Node rootBuilder;\r
- rootBuilder = createIndex(file, lang);\r
- FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));\r
- rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));\r
+ rootBuilder = processDictionaryLines(dictionaryFileName, lang);\r
+ FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang));\r
+ rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang));\r
\r
rootBuilder.forEachNode(new Function<Node>() {\r
@Override\r
\r
// Dump twice to get accurate file locations.\r
for (int i = 0; i < 2; ++i) {\r
- final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw"); \r
+ final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw"); \r
rootBuilder.dump(raf);\r
raf.close();\r
}\r
- \r
}\r
\r
// ----------------------------------------------------------------\r
}\r
\r
static final class Node implements Serializable {\r
- final String normalizedWord;\r
+ final String normalizedToken;\r
\r
final TreeMap<String, Node> children = new TreeMap<String, Node>();\r
final TreeMap<String,List<EntryDescriptor>> entryDescriptorsMap = new TreeMap<String, List<EntryDescriptor>>();\r
\r
// final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();\r
\r
- int descendantOffsetCount = 0;\r
- \r
int indexFileLocation = -1;\r
\r
- public Node(final String normalizedWord) {\r
- if (normalizedWord.length() == 0) {\r
+ private int descendantTokenCount;\r
+ private int descendantEntryCount = 0;\r
+\r
+ public Node(final String normalizedToken) {\r
+ if (normalizedToken.length() == 0) {\r
System.out.println("Created root.");\r
}\r
- this.normalizedWord = normalizedWord.intern();\r
+ this.normalizedToken = normalizedToken.intern();\r
}\r
\r
- public Node getNode(final String nWord, final int pos,\r
+ public Node getNode(final String nToken, final int pos,\r
final boolean create) {\r
- assert this.normalizedWord.equals(nWord.substring(0, pos));\r
+ assert this.normalizedToken.equals(nToken.substring(0, pos));\r
\r
- if (pos == nWord.length()) {\r
- assert normalizedWord.equals(nWord);\r
+ if (pos == nToken.length()) {\r
+ assert normalizedToken.equals(nToken);\r
return this;\r
}\r
\r
- final String rest = nWord.substring(pos);\r
+ final String rest = nToken.substring(pos);\r
assert rest.length() > 0;\r
\r
final Map.Entry<String, Node> lcsEntry;\r
if (!create) {\r
return null;\r
}\r
- final Node result = new Node(nWord);\r
+ final Node result = new Node(nToken);\r
final Object old = children.put(rest.intern(), result);\r
assert old == null;\r
// System.out.println(" Adding final chunk: " + rest);\r
// The map already contained the LCS.\r
if (lcs.length() == lcsEntry.getKey().length()) {\r
assert lcs.equals(lcsEntry.getKey());\r
- final Node result = lcsEntry.getValue().getNode(nWord,\r
+ final Node result = lcsEntry.getValue().getNode(nToken,\r
pos + lcs.length(), create);\r
- assert result.normalizedWord.equals(nWord);\r
+ assert result.normalizedToken.equals(nToken);\r
return result;\r
}\r
\r
// Have to split, inserting the LCS.\r
// System.out.println(" Splitting " + lcsEntry + "/" + nToken + " @ " +
// lcs);
- final Node newChild = new Node(nWord.substring(0, pos + lcs.length()));\r
+ final Node newChild = new Node(nToken.substring(0, pos + lcs.length()));\r
final Object old = children.put(lcs.intern(), newChild);\r
assert old == null;\r
children.remove(lcsEntry.getKey());\r
if (lcs.equals(rest)) {\r
return newChild;\r
}\r
- final Node result = new Node(nWord);\r
+ final Node result = new Node(nToken);\r
final Object old2 = newChild.children.put(rest.substring(lcs.length())\r
.intern(), result);\r
assert old2 == null;\r
}\r
\r
void recursiveSetDescendantOffsetCount() {\r
- descendantOffsetCount = offsets.size();\r
+ descendantEntryCount = 0;\r
+ descendantTokenCount = 0;\r
+ for (final List<EntryDescriptor> entryDescriptors : entryDescriptorsMap.values()) {\r
+ descendantTokenCount += 1;\r
+ descendantEntryCount += entryDescriptors.size();\r
+ }\r
for (final Node child : children.values()) {\r
child.recursiveSetDescendantOffsetCount();\r
- descendantOffsetCount += child.descendantOffsetCount;\r
+ descendantTokenCount += child.descendantTokenCount;\r
+ descendantEntryCount += child.descendantEntryCount;\r
}\r
}\r
\r
@Override\r
public String toString() {\r
- return normalizedWord;\r
+ return normalizedToken;\r
}\r
\r
void dump(final RandomAccessFile file) throws IOException {\r
for (final Map.Entry<String, List<EntryDescriptor>> entry : entryDescriptorsMap.entrySet()) {\r
file.writeUTF(entry.getKey());\r
file.writeInt(entry.getValue().size());\r
- \r
- file.writeInt(offsets.get(i).offset);\r
+ for (int i = 0; i < entry.getValue().size(); ++i) {\r
+ file.writeInt(entry.getValue().get(i).offset);\r
+ }\r
}\r
\r
// Dump children.\r
child.getValue().dump(file);\r
}\r
}\r
+\r
+ public void addToken(final String token, final EntryDescriptor entryDescriptor) {\r
+ List<EntryDescriptor> entryDescriptors = this.entryDescriptorsMap.get(token);\r
+ if (entryDescriptors == null) {\r
+ entryDescriptors = new ArrayList<EntryDescriptor>();\r
+ this.entryDescriptorsMap.put(token, entryDescriptors);\r
+ }\r
+ entryDescriptors.add(entryDescriptor);\r
+ }\r
}\r
\r
// ----------------------------------------------------------------\r
\r
- static Node createIndex(final String file, final byte lang) throws IOException {\r
+ static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException {\r
final Node root = new Node("");\r
- final RandomAccessFile raf = new RandomAccessFile(file, "r");\r
+ final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r");\r
String line;\r
final Entry entry = new Entry();\r
int lineCount = 0;\r
long fileLocation = 0;\r
- while ((line = raf.readLine()) != null) {\r
+ while ((line = dictionaryFile.readLine()) != null) {\r
assert ((int) fileLocation) == fileLocation;\r
\r
line = line.trim();\r
}\r
final String text = entry.getIndexableText(Entry.LANG1);\r
final String[] tokens = WHITESPACE.split(text);\r
- final Set<String> tokenSet = new LinkedHashSet<String>();\r
+ final Map<String,String> tokenToNormalizedMap = new LinkedHashMap<String,String>();\r
for (String token : tokens) {\r
if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {\r
continue;\r
}\r
- tokenSet.add(EntryFactory.entryFactory.normalizeToken(token, lang));\r
+ tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token, lang));\r
}\r
- for (final String normalized : tokenSet) {\r
- // System.out.println("Inserting: " + normalized);\r
- if ("die".equals(normalized) || "eine".equals(normalized)) {\r
- // System.out.println("hello");\r
- }\r
- final Node node = root.getNode(normalized, 0, true);\r
- node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length));\r
- assert node == root.getNode(normalized, 0, false);\r
- assert normalized\r
- .equals(root.getNode(normalized, 0, false).normalizedWord);\r
+ for (final Map.Entry<String, String> tokenToNormalized : tokenToNormalizedMap.entrySet()) {\r
+ final String normalizedToken = tokenToNormalized.getValue();\r
+ final Node node = root.getNode(normalizedToken, 0, true);\r
+ node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length));\r
+ assert node == root.getNode(normalizedToken, 0, false);\r
+ assert normalizedToken\r
+ .equals(root.getNode(normalizedToken, 0, false).normalizedToken);\r
}\r
\r
if (lineCount % 10000 == 0) {\r
}\r
\r
lineCount++;\r
- fileLocation = raf.getFilePointer();\r
+ fileLocation = dictionaryFile.getFilePointer();\r
}\r
- raf.close();\r
+ dictionaryFile.close();\r
return root;\r
}\r
\r