From: thadh Date: Mon, 9 Mar 2009 05:21:04 +0000 (-0700) Subject: go X-Git-Url: http://gitweb.fperrin.net/?a=commitdiff_plain;h=7ce755ee7faeba246f8e8078556f83b2b4dac108;p=DictionaryPC.git go --- diff --git a/src/com/hughes/android/dictionary/DictionaryBuilder.java b/src/com/hughes/android/dictionary/DictionaryBuilder.java new file mode 100755 index 0000000..acf0345 --- /dev/null +++ b/src/com/hughes/android/dictionary/DictionaryBuilder.java @@ -0,0 +1,99 @@ +package com.hughes.android.dictionary; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.regex.Pattern; + +import com.hughes.android.dictionary.Dictionary.IndexEntry; +import com.hughes.android.dictionary.Dictionary.Row; + +public class DictionaryBuilder { + + static final Pattern WHITESPACE = Pattern.compile("\\s+"); + + public static void createIndex(final Dictionary dict, final byte lang) { + + final SortedMap sortedIndex = new TreeMap( + EntryFactory.entryFactory.getEntryComparator()); + final EntryData entryDatas[] = new EntryData[dict.entries.size()]; + + for (int e = 0; e < dict.entries.size(); ++e) { + final Entry entry = dict.entries.get(e); + final String text = entry.getIndexableText(lang); + final Set tokens = new LinkedHashSet(Arrays + .asList(WHITESPACE.split(text.trim()))); + entryDatas[e] = new EntryData(tokens.size()); + for (final String token : tokens) { + TokenData tokenData = sortedIndex.get(token); + if (tokenData == null) { + tokenData = new TokenData(token); + sortedIndex.put(token, tokenData); + } + tokenData.entries.add(e); + } + } + + // Sort it. + + final Comparator entryComparator = new Comparator() { + @Override + public int compare(Integer o1, Integer o2) { + return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1 + : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1; + } + }; + + for (final TokenData tokenData : sortedIndex.values()) { + Collections.sort(tokenData.entries, entryComparator); + } + + // Put it all together. + + final List rows = dict.languages[lang].rows; + final List indexEntries = dict.languages[lang].sortedIndex; + + int tokenDataIndex = 0; + for (final TokenData tokenData : sortedIndex.values()) { + final int startRow = rows.size(); + final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow); + indexEntries.add(indexEntry); + + final Row tokenRow = new Row(-(tokenDataIndex + 1)); + rows.add(tokenRow); + + for (final Integer e : tokenData.entries) { + final Row entryRow = new Row(e); + rows.add(entryRow); + } + ++tokenDataIndex; + } + + } + + static final class EntryData { + final int numTokens; + + public EntryData(int numTokens) { + this.numTokens = numTokens; + } + } + + static final class TokenData { + final String token; + final List entries = new ArrayList(); + + int startRow; + + public TokenData(String token) { + this.token = token; + } + } + +} diff --git a/src/com/hughes/android/dictionary/DictionaryTest.java b/src/com/hughes/android/dictionary/DictionaryTest.java new file mode 100755 index 0000000..43fb7ab --- /dev/null +++ b/src/com/hughes/android/dictionary/DictionaryTest.java @@ -0,0 +1,68 @@ +package com.hughes.android.dictionary; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.Arrays; +import java.util.List; + +import junit.framework.TestCase; + +import com.hughes.android.dictionary.Dictionary.IndexEntry; +import com.hughes.android.dictionary.Dictionary.Row; + +public class DictionaryTest extends TestCase { + + public void testDictionary() throws IOException { + final File file = File.createTempFile("asdf", "asdf"); + file.deleteOnExit(); + + final Dictionary goldenDict; + final List entries = Arrays.asList( + new Entry("der Hund", "the dog"), + new Entry("Die grosse Katze", "The big cat"), + new Entry("die Katze", "the cat"), + new Entry("gross", "big"), + new Entry("Dieb", "thief"), + new Entry("rennen", "run")); + + { + final Dictionary dict = new Dictionary("de", "en"); + for (final Entry entry : entries) { + dict.entries.add(entry); + } + DictionaryBuilder.createIndex(dict, Entry.LANG1); + DictionaryBuilder.createIndex(dict, Entry.LANG2); + final RandomAccessFile raf = new RandomAccessFile(file, "rw"); + dict.write(raf); + raf.close(); + + goldenDict = dict; + } + + final RandomAccessFile raf = new RandomAccessFile(file, "r"); + final Dictionary dict = new Dictionary(raf); + + assertEquals(entries, dict.entries); + + assertEquals("der", dict.languages[0].sortedIndex.get(0).word); + assertEquals("Die", dict.languages[0].sortedIndex.get(1).word); + + for (final IndexEntry indexEntry : dict.languages[0].sortedIndex) { + System.out.println(indexEntry); + } + + int rowCount = 0; + for (final Row row : dict.languages[0].rows) { + if (row.index >= 0) { + System.out.println(" " + rowCount + ":" + dict.entries.get(row.index)); + } else { + System.out.println(rowCount + ":" + dict.languages[0].sortedIndex.get(-row.index - 1)); + } + ++rowCount; + } + + + } + +} diff --git a/src/com/hughes/android/dictionary/IndexBuilder.java b/src/com/hughes/android/dictionary/IndexBuilder.java index bc5d72d..49d393d 100755 --- a/src/com/hughes/android/dictionary/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/IndexBuilder.java @@ -78,184 +78,6 @@ public class IndexBuilder { } } - static final class Node implements Serializable { - final String normalizedToken; - - final TreeMap children = new TreeMap(); - final TreeMap> entryDescriptorsMap = new TreeMap>(); - -// final List offsets = new ArrayList(); - - int indexFileLocation = -1; - - private int descendantTokenCount; - private int descendantEntryCount = 0; - - public Node(final String normalizedToken) { - if (normalizedToken.length() == 0) { - System.out.println("Created root."); - } - this.normalizedToken = normalizedToken.intern(); - } - - public Node getNode(final String nToken, final int pos, - final boolean create) { - assert this.normalizedToken.equals(nToken.substring(0, pos)); - - if (pos == nToken.length()) { - assert normalizedToken.equals(nToken); - return this; - } - - final String rest = nToken.substring(pos); - assert rest.length() > 0; - - final Map.Entry lcsEntry; - final String lcs; - { - final Map.Entry floorEntry = children.floorEntry(rest); - final Map.Entry ceilingEntry = children - .ceilingEntry(rest); - final String floorLcs = floorEntry == null ? "" : StringUtil - .longestCommonSubstring(rest, floorEntry.getKey()); - final String ceilingLcs = ceilingEntry == null ? "" : StringUtil - .longestCommonSubstring(rest, ceilingEntry.getKey()); - if (floorLcs.length() > ceilingLcs.length()) { - lcsEntry = floorEntry; - lcs = floorLcs; - } else { - lcsEntry = ceilingEntry; - lcs = ceilingLcs; - } - } - - // No LCS, have to add everything. - if (lcs.length() == 0) { - if (!create) { - return null; - } - final Node result = new Node(nToken); - final Object old = children.put(rest.intern(), result); - assert old == null; - // System.out.println(" Adding final chunk: " + rest); - return result; - } - - assert lcsEntry != null; - - // The map already contained the LCS. - if (lcs.length() == lcsEntry.getKey().length()) { - assert lcs.equals(lcsEntry.getKey()); - final Node result = lcsEntry.getValue().getNode(nToken, - pos + lcs.length(), create); - assert result.normalizedToken.equals(nToken); - return result; - } - - if (!create) { - return null; - } - - // Have to split, inserting the LCS. - // System.out.println(" Splitting " + lcsEntry + "/" + word + " @ " + - // lcs); - final Node newChild = new Node(nToken.substring(0, pos + lcs.length())); - final Object old = children.put(lcs.intern(), newChild); - assert old == null; - children.remove(lcsEntry.getKey()); - newChild.children.put(lcsEntry.getKey().substring(lcs.length()) - .intern(), lcsEntry.getValue()); - - if (lcs.equals(rest)) { - return newChild; - } - final Node result = new Node(nToken); - final Object old2 = newChild.children.put(rest.substring(lcs.length()) - .intern(), result); - assert old2 == null; - // System.out.println(" newchildren=" + newChild.children); - - return result; - } - - void forEachNode(final Function f) { - f.invoke(this); - for (final Node child : children.values()) { - child.forEachNode(f); - } - } - - int descendantCount() { - int count = 1; - for (final Node child : children.values()) { - count += child.descendantCount(); - } - return count; - } - - void recursiveSetDescendantCounts() { - descendantTokenCount = entryDescriptorsMap.size(); - descendantEntryCount = 0; - - for (final Node child : children.values()) { - child.recursiveSetDescendantCounts(); - descendantTokenCount += child.descendantTokenCount; - descendantEntryCount += child.descendantEntryCount; - } - - for (final List entryDescriptors : entryDescriptorsMap.values()) { - descendantEntryCount += entryDescriptors.size(); - } - } - - @Override - public String toString() { - return normalizedToken; - } - - void dump(final RandomAccessFile file) throws IOException { - if (indexFileLocation == -1) { - indexFileLocation = (int) file.getFilePointer(); - } else { - assert indexFileLocation == file.getFilePointer(); - } - - // Children to location. - file.writeInt(children.size()); - for (final Map.Entry child : children.entrySet()) { - file.writeUTF(child.getKey()); - file.writeInt(child.getValue().indexFileLocation); - } - - // Entries. - file.writeInt(entryDescriptorsMap.size()); - for (final Map.Entry> entry : entryDescriptorsMap.entrySet()) { - file.writeUTF(entry.getKey()); - file.writeInt(entry.getValue().size()); - for (int i = 0; i < entry.getValue().size(); ++i) { - file.writeInt(entry.getValue().get(i).offset); - } - } - - // Dump counts. - file.writeInt(descendantTokenCount); - file.writeInt(descendantEntryCount); - - // Dump children. - for (final Map.Entry child : children.entrySet()) { - child.getValue().dump(file); - } - } - - public void addToken(final String token, final EntryDescriptor entryDescriptor) { - List entryDescriptors = this.entryDescriptorsMap.get(token); - if (entryDescriptors == null) { - entryDescriptors = new ArrayList(); - this.entryDescriptorsMap.put(token, entryDescriptors); - } - entryDescriptors.add(entryDescriptor); - } - } // ----------------------------------------------------------------