--- /dev/null
+package com.hughes.android.dictionary;\r
+\r
+import java.util.ArrayList;\r
+import java.util.Arrays;\r
+import java.util.Collections;\r
+import java.util.Comparator;\r
+import java.util.LinkedHashSet;\r
+import java.util.List;\r
+import java.util.Set;\r
+import java.util.SortedMap;\r
+import java.util.TreeMap;\r
+import java.util.regex.Pattern;\r
+\r
+import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
+import com.hughes.android.dictionary.Dictionary.Row;\r
+\r
+public class DictionaryBuilder {\r
+\r
+ static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
+\r
+ public static void createIndex(final Dictionary dict, final byte lang) {\r
+\r
+ final SortedMap<String, TokenData> sortedIndex = new TreeMap<String, TokenData>(\r
+ EntryFactory.entryFactory.getEntryComparator());\r
+ final EntryData entryDatas[] = new EntryData[dict.entries.size()];\r
+\r
+ for (int e = 0; e < dict.entries.size(); ++e) {\r
+ final Entry entry = dict.entries.get(e);\r
+ final String text = entry.getIndexableText(lang);\r
+ final Set<String> tokens = new LinkedHashSet<String>(Arrays\r
+ .asList(WHITESPACE.split(text.trim())));\r
+ entryDatas[e] = new EntryData(tokens.size());\r
+ for (final String token : tokens) {\r
+ TokenData tokenData = sortedIndex.get(token);\r
+ if (tokenData == null) {\r
+ tokenData = new TokenData(token);\r
+ sortedIndex.put(token, tokenData);\r
+ }\r
+ tokenData.entries.add(e);\r
+ }\r
+ }\r
+\r
+ // Sort it.\r
+\r
+ final Comparator<Integer> entryComparator = new Comparator<Integer>() {\r
+ @Override\r
+ public int compare(Integer o1, Integer o2) {\r
+ return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1\r
+ : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;\r
+ }\r
+ };\r
+\r
+ for (final TokenData tokenData : sortedIndex.values()) {\r
+ Collections.sort(tokenData.entries, entryComparator);\r
+ }\r
+\r
+ // Put it all together.\r
+\r
+ final List<Row> rows = dict.languages[lang].rows;\r
+ final List<IndexEntry> indexEntries = dict.languages[lang].sortedIndex;\r
+\r
+ int tokenDataIndex = 0;\r
+ for (final TokenData tokenData : sortedIndex.values()) {\r
+ final int startRow = rows.size();\r
+ final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);\r
+ indexEntries.add(indexEntry);\r
+\r
+ final Row tokenRow = new Row(-(tokenDataIndex + 1));\r
+ rows.add(tokenRow);\r
+\r
+ for (final Integer e : tokenData.entries) {\r
+ final Row entryRow = new Row(e);\r
+ rows.add(entryRow);\r
+ }\r
+ ++tokenDataIndex;\r
+ }\r
+\r
+ }\r
+\r
+ static final class EntryData {\r
+ final int numTokens;\r
+\r
+ public EntryData(int numTokens) {\r
+ this.numTokens = numTokens;\r
+ }\r
+ }\r
+\r
+ static final class TokenData {\r
+ final String token;\r
+ final List<Integer> entries = new ArrayList<Integer>();\r
+\r
+ int startRow;\r
+\r
+ public TokenData(String token) {\r
+ this.token = token;\r
+ }\r
+ }\r
+\r
+}\r
--- /dev/null
+package com.hughes.android.dictionary;\r
+\r
+import java.io.File;\r
+import java.io.IOException;\r
+import java.io.RandomAccessFile;\r
+import java.util.Arrays;\r
+import java.util.List;\r
+\r
+import junit.framework.TestCase;\r
+\r
+import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
+import com.hughes.android.dictionary.Dictionary.Row;\r
+\r
+public class DictionaryTest extends TestCase {\r
+\r
+ public void testDictionary() throws IOException {\r
+ final File file = File.createTempFile("asdf", "asdf");\r
+ file.deleteOnExit();\r
+\r
+ final Dictionary goldenDict;\r
+ final List<Entry> entries = Arrays.asList(\r
+ new Entry("der Hund", "the dog"),\r
+ new Entry("Die grosse Katze", "The big cat"), \r
+ new Entry("die Katze", "the cat"),\r
+ new Entry("gross", "big"),\r
+ new Entry("Dieb", "thief"),\r
+ new Entry("rennen", "run"));\r
+\r
+ {\r
+ final Dictionary dict = new Dictionary("de", "en");\r
+ for (final Entry entry : entries) {\r
+ dict.entries.add(entry);\r
+ }\r
+ DictionaryBuilder.createIndex(dict, Entry.LANG1);\r
+ DictionaryBuilder.createIndex(dict, Entry.LANG2);\r
+ final RandomAccessFile raf = new RandomAccessFile(file, "rw");\r
+ dict.write(raf);\r
+ raf.close();\r
+ \r
+ goldenDict = dict;\r
+ }\r
+\r
+ final RandomAccessFile raf = new RandomAccessFile(file, "r");\r
+ final Dictionary dict = new Dictionary(raf);\r
+ \r
+ assertEquals(entries, dict.entries);\r
+ \r
+ assertEquals("der", dict.languages[0].sortedIndex.get(0).word);\r
+ assertEquals("Die", dict.languages[0].sortedIndex.get(1).word);\r
+ \r
+ for (final IndexEntry indexEntry : dict.languages[0].sortedIndex) {\r
+ System.out.println(indexEntry);\r
+ }\r
+\r
+ int rowCount = 0;\r
+ for (final Row row : dict.languages[0].rows) {\r
+ if (row.index >= 0) {\r
+ System.out.println(" " + rowCount + ":" + dict.entries.get(row.index));\r
+ } else {\r
+ System.out.println(rowCount + ":" + dict.languages[0].sortedIndex.get(-row.index - 1));\r
+ }\r
+ ++rowCount;\r
+ }\r
+\r
+\r
+ }\r
+\r
+}\r
}\r
}\r
\r
- static final class Node implements Serializable {\r
- final String normalizedToken;\r
- \r
- final TreeMap<String, Node> children = new TreeMap<String, Node>();\r
- final TreeMap<String,List<EntryDescriptor>> entryDescriptorsMap = new TreeMap<String, List<EntryDescriptor>>();\r
- \r
-// final List<EntryDescriptor> offsets = new ArrayList<EntryDescriptor>();\r
- \r
- int indexFileLocation = -1;\r
-\r
- private int descendantTokenCount;\r
- private int descendantEntryCount = 0;\r
-\r
- public Node(final String normalizedToken) {\r
- if (normalizedToken.length() == 0) {\r
- System.out.println("Created root.");\r
- }\r
- this.normalizedToken = normalizedToken.intern();\r
- }\r
-\r
- public Node getNode(final String nToken, final int pos,\r
- final boolean create) {\r
- assert this.normalizedToken.equals(nToken.substring(0, pos));\r
-\r
- if (pos == nToken.length()) {\r
- assert normalizedToken.equals(nToken);\r
- return this;\r
- }\r
-\r
- final String rest = nToken.substring(pos);\r
- assert rest.length() > 0;\r
-\r
- final Map.Entry<String, Node> lcsEntry;\r
- final String lcs;\r
- {\r
- final Map.Entry<String, Node> floorEntry = children.floorEntry(rest);\r
- final Map.Entry<String, Node> ceilingEntry = children\r
- .ceilingEntry(rest);\r
- final String floorLcs = floorEntry == null ? "" : StringUtil\r
- .longestCommonSubstring(rest, floorEntry.getKey());\r
- final String ceilingLcs = ceilingEntry == null ? "" : StringUtil\r
- .longestCommonSubstring(rest, ceilingEntry.getKey());\r
- if (floorLcs.length() > ceilingLcs.length()) {\r
- lcsEntry = floorEntry;\r
- lcs = floorLcs;\r
- } else {\r
- lcsEntry = ceilingEntry;\r
- lcs = ceilingLcs;\r
- }\r
- }\r
-\r
- // No LCS, have to add everything.\r
- if (lcs.length() == 0) {\r
- if (!create) {\r
- return null;\r
- }\r
- final Node result = new Node(nToken);\r
- final Object old = children.put(rest.intern(), result);\r
- assert old == null;\r
- // System.out.println(" Adding final chunk: " + rest);\r
- return result;\r
- }\r
-\r
- assert lcsEntry != null;\r
-\r
- // The map already contained the LCS.\r
- if (lcs.length() == lcsEntry.getKey().length()) {\r
- assert lcs.equals(lcsEntry.getKey());\r
- final Node result = lcsEntry.getValue().getNode(nToken,\r
- pos + lcs.length(), create);\r
- assert result.normalizedToken.equals(nToken);\r
- return result;\r
- }\r
-\r
- if (!create) {\r
- return null;\r
- }\r
-\r
- // Have to split, inserting the LCS.\r
- // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " +\r
- // lcs);\r
- final Node newChild = new Node(nToken.substring(0, pos + lcs.length()));\r
- final Object old = children.put(lcs.intern(), newChild);\r
- assert old == null;\r
- children.remove(lcsEntry.getKey());\r
- newChild.children.put(lcsEntry.getKey().substring(lcs.length())\r
- .intern(), lcsEntry.getValue());\r
-\r
- if (lcs.equals(rest)) {\r
- return newChild;\r
- }\r
- final Node result = new Node(nToken);\r
- final Object old2 = newChild.children.put(rest.substring(lcs.length())\r
- .intern(), result);\r
- assert old2 == null;\r
- // System.out.println(" newchildren=" + newChild.children);\r
-\r
- return result;\r
- }\r
-\r
- void forEachNode(final Function<Node> f) {\r
- f.invoke(this);\r
- for (final Node child : children.values()) {\r
- child.forEachNode(f);\r
- }\r
- }\r
-\r
- int descendantCount() {\r
- int count = 1;\r
- for (final Node child : children.values()) {\r
- count += child.descendantCount();\r
- }\r
- return count;\r
- }\r
-\r
- void recursiveSetDescendantCounts() {\r
- descendantTokenCount = entryDescriptorsMap.size();\r
- descendantEntryCount = 0;\r
-\r
- for (final Node child : children.values()) {\r
- child.recursiveSetDescendantCounts();\r
- descendantTokenCount += child.descendantTokenCount;\r
- descendantEntryCount += child.descendantEntryCount;\r
- }\r
-\r
- for (final List<EntryDescriptor> entryDescriptors : entryDescriptorsMap.values()) {\r
- descendantEntryCount += entryDescriptors.size();\r
- }\r
- }\r
-\r
- @Override\r
- public String toString() {\r
- return normalizedToken;\r
- }\r
- \r
- void dump(final RandomAccessFile file) throws IOException {\r
- if (indexFileLocation == -1) {\r
- indexFileLocation = (int) file.getFilePointer();\r
- } else {\r
- assert indexFileLocation == file.getFilePointer();\r
- }\r
- \r
- // Children to location.\r
- file.writeInt(children.size());\r
- for (final Map.Entry<String, Node> child : children.entrySet()) {\r
- file.writeUTF(child.getKey());\r
- file.writeInt(child.getValue().indexFileLocation);\r
- }\r
- \r
- // Entries.\r
- file.writeInt(entryDescriptorsMap.size());\r
- for (final Map.Entry<String, List<EntryDescriptor>> entry : entryDescriptorsMap.entrySet()) {\r
- file.writeUTF(entry.getKey());\r
- file.writeInt(entry.getValue().size());\r
- for (int i = 0; i < entry.getValue().size(); ++i) {\r
- file.writeInt(entry.getValue().get(i).offset);\r
- }\r
- }\r
-\r
- // Dump counts.\r
- file.writeInt(descendantTokenCount);\r
- file.writeInt(descendantEntryCount);\r
- \r
- // Dump children.\r
- for (final Map.Entry<String, Node> child : children.entrySet()) {\r
- child.getValue().dump(file);\r
- }\r
- }\r
-\r
- public void addToken(final String token, final EntryDescriptor entryDescriptor) {\r
- List<EntryDescriptor> entryDescriptors = this.entryDescriptorsMap.get(token);\r
- if (entryDescriptors == null) {\r
- entryDescriptors = new ArrayList<EntryDescriptor>();\r
- this.entryDescriptorsMap.put(token, entryDescriptors);\r
- }\r
- entryDescriptors.add(entryDescriptor);\r
- }\r
- }\r
\r
// ----------------------------------------------------------------\r
\r