FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", file, lang));\r
rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", file, lang));\r
\r
-// final AtomicInteger c = new AtomicInteger();\r
rootBuilder.forEachNode(new Function<Node>() {\r
@Override\r
- public void invoke(Node t) {\r
- Collections.sort(t.offsets);\r
-// if (t.offsets.size() > 128) {\r
-// System.out.println(t);\r
-// c.incrementAndGet();\r
-// }\r
+ public void invoke(final Node node) {\r
+ for (final List<EntryDescriptor> entryDescriptors : node.entries.values()) {\r
+ Collections.sort(entryDescriptors);\r
+ }\r
}});\r
-// System.out.println(c);\r
\r
-// rootBuilder.recursiveSetDescendantOffsetCount();\r
-// rootBuilder.packDescendants(128);\r
-\r
// Dump twice to get accurate file locations.\r
for (int i = 0; i < 2; ++i) {\r
final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, file, lang), "rw"); \r
}\r
\r
static final class Node implements Serializable {\r
-        private static final long serialVersionUID = -5423134653901704956L;
+        private static final long serialVersionUID = 1L;
+
+        final String normalizedWord;
\r
final TreeMap<String, Node> children = new TreeMap<String, Node>();\r
- final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();\r
- final String sequence;\r
+ final TreeMap<String,List<EntryDescriptor>> entries = new TreeMap<String, List<EntryDescriptor>>();\r
+ \r
+// final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();\r
\r
int descendantOffsetCount = 0;\r
\r
int indexFileLocation = -1;\r
\r
- public Node(String sequence) {\r
- if (sequence.length() == 0) {\r
+ public Node(final String normalizedWord) {\r
+ if (normalizedWord.length() == 0) {\r
System.out.println("Created root.");\r
}\r
- this.sequence = sequence.intern();\r
+ this.normalizedWord = normalizedWord.intern();\r
}\r
\r
- public Node getIndexNode(final String word, final int pos,\r
+ public Node getNode(final String nWord, final int pos,\r
final boolean create) {\r
- assert this.sequence.equals(word.substring(0, pos));\r
+ assert this.normalizedWord.equals(nWord.substring(0, pos));\r
\r
- if (pos == word.length()) {\r
- assert sequence.equals(word);\r
+ if (pos == nWord.length()) {\r
+ assert normalizedWord.equals(nWord);\r
return this;\r
}\r
\r
- final String rest = word.substring(pos);\r
+ final String rest = nWord.substring(pos);\r
assert rest.length() > 0;\r
\r
final Map.Entry<String, Node> lcsEntry;\r
if (!create) {\r
return null;\r
}\r
- final Node result = new Node(word);\r
+ final Node result = new Node(nWord);\r
final Object old = children.put(rest.intern(), result);\r
assert old == null;\r
// System.out.println(" Adding final chunk: " + rest);\r
// The map already contained the LCS.\r
if (lcs.length() == lcsEntry.getKey().length()) {\r
assert lcs.equals(lcsEntry.getKey());\r
- final Node result = lcsEntry.getValue().getIndexNode(word,\r
+ final Node result = lcsEntry.getValue().getNode(nWord,\r
pos + lcs.length(), create);\r
- assert result.sequence.equals(word);\r
+ assert result.normalizedWord.equals(nWord);\r
return result;\r
}\r
\r
// Have to split, inserting the LCS.\r
// System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +\r
// lcs);\r
- final Node newChild = new Node(word.substring(0, pos + lcs.length()));\r
+ final Node newChild = new Node(nWord.substring(0, pos + lcs.length()));\r
final Object old = children.put(lcs.intern(), newChild);\r
assert old == null;\r
children.remove(lcsEntry.getKey());\r
if (lcs.equals(rest)) {\r
return newChild;\r
}\r
- final Node result = new Node(word);\r
+ final Node result = new Node(nWord);\r
final Object old2 = newChild.children.put(rest.substring(lcs.length())\r
.intern(), result);\r
assert old2 == null;\r
return result;\r
}\r
\r
- MemoryIndex.Node toIndexNode() {\r
- final MemoryIndex.Node result = new MemoryIndex.Node(children.size(), offsets\r
- .size());\r
- int i = 0;\r
- for (final Map.Entry<String, Node> entry : children.entrySet()) {\r
- result.chars[i] = entry.getKey();\r
- result.children[i] = entry.getValue().toIndexNode();\r
- i++;\r
- }\r
- return result;\r
- }\r
-\r
void forEachNode(final Function<Node> f) {\r
f.invoke(this);\r
            for (final Node child : children.values()) {
                child.forEachNode(f);
            }
}\r
\r
- public void packDescendants(final int maxDescendants) {\r
- if (descendantOffsetCount <= maxDescendants) {\r
- final Set<EntryDescriptor> descendantOffsets = new LinkedHashSet<EntryDescriptor>();\r
- recursiveAddDescendants(descendantOffsets);\r
- assert descendantOffsets.size() <= maxDescendants;\r
- offsets.clear();\r
- offsets.addAll(descendantOffsets);\r
- children.clear();\r
- } else {\r
- for (final Node child : children.values()) {\r
- child.packDescendants(maxDescendants);\r
- }\r
- }\r
- }\r
-\r
- private void recursiveAddDescendants(final Set<EntryDescriptor> descendantOffsets) {\r
- descendantOffsets.addAll(this.offsets);\r
- for (final Node child : children.values()) {\r
- child.recursiveAddDescendants(descendantOffsets);\r
- }\r
- }\r
-\r
@Override\r
public String toString() {\r
-            return sequence + ":" + offsets.size();
+            return normalizedWord + ":" + entries.size();
}\r
\r
void dump(final RandomAccessFile file) throws IOException {\r
if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {\r
continue;\r
}\r
- tokenSet.add(entry.normalizeToken(token, lang));\r
+ tokenSet.add(EntryFactory.entryFactory.normalizeToken(token, lang));\r
}\r
for (final String normalized : tokenSet) {\r
// System.out.println("Inserting: " + normalized);\r
if ("die".equals(normalized) || "eine".equals(normalized)) {\r
// System.out.println("hello");\r
}\r
- final Node node = root.getIndexNode(normalized, 0, true);\r
+ final Node node = root.getNode(normalized, 0, true);\r
-                    node.offsets.add(new EntryDescriptor((int) fileLocation, tokens.length));
+                    List<EntryDescriptor> entryDescriptors = node.entries.get(normalized);
+                    if (entryDescriptors == null) {
+                        entryDescriptors = new ArrayList<EntryDescriptor>();
+                        node.entries.put(normalized, entryDescriptors);
+                    }
+                    entryDescriptors.add(new EntryDescriptor((int) fileLocation, tokens.length));
- assert node == root.getIndexNode(normalized, 0, false);\r
+ assert node == root.getNode(normalized, 0, false);\r
assert normalized\r
- .equals(root.getIndexNode(normalized, 0, false).sequence);\r
+ .equals(root.getNode(normalized, 0, false).normalizedWord);\r
}\r
\r
if (lineCount % 10000 == 0) {\r