package com.hughes.android.dictionary;\r
\r
+import java.io.IOException;\r
+import java.io.RandomAccessFile;\r
import java.util.ArrayList;\r
import java.util.Arrays;\r
import java.util.Collections;\r
import java.util.Comparator;\r
+import java.util.HashMap;\r
import java.util.LinkedHashSet;\r
import java.util.List;\r
+import java.util.Map;\r
import java.util.Set;\r
-import java.util.SortedMap;\r
-import java.util.TreeMap;\r
import java.util.regex.Pattern;\r
\r
import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
\r
static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
\r
+  /**\r
+   * Builds a dictionary from a text file and writes it to an output file.\r
+   *\r
+   * Each input line is trimmed; blank lines and lines starting with "#" are\r
+   * skipped. Remaining lines are parsed with Entry.parseFromLine; lines that\r
+   * fail to parse are reported on stderr and skipped. Both language indexes\r
+   * are then created and the dictionary is written to the output file, which\r
+   * is truncated first.\r
+   *\r
+   * @param args exactly two elements: input file path, output file path;\r
+   *        otherwise a usage line is printed to stderr and nothing is done\r
+   * @throws IOException if reading the input or writing the output fails\r
+   * @throws ClassNotFoundException NOTE(review): presumably propagated from a\r
+   *         callee; nothing in this method throws it directly - confirm\r
+   */\r
+  public static void main(String[] args) throws IOException,\r
+      ClassNotFoundException {\r
+    if (args.length != 2) {\r
+      System.err.println("inputfile outputfile");\r
+      return;\r
+    }\r
+\r
+    final Dictionary dict = new Dictionary("de", "en");\r
+    final RandomAccessFile dictionaryFile = new RandomAccessFile(args[0], "r");\r
+    // Close the input even if reading or parsing throws.\r
+    try {\r
+      // NOTE(review): RandomAccessFile.readLine() maps each byte to one char,\r
+      // so multi-byte encodings such as UTF-8 are not decoded here; confirm\r
+      // the expected input encoding.\r
+      String line;\r
+      int lineCount = 0;\r
+      // Start offset of the line that was just read; refreshed on every\r
+      // iteration so it stays accurate across skipped lines.\r
+      long fileLocation = dictionaryFile.getFilePointer();\r
+      while ((line = dictionaryFile.readLine()) != null) {\r
+        // Offsets must fit in an int.\r
+        assert ((int) fileLocation) == fileLocation;\r
+        fileLocation = dictionaryFile.getFilePointer();\r
+\r
+        line = line.trim();\r
+        if (line.isEmpty() || line.startsWith("#")) {\r
+          continue;\r
+        }\r
+\r
+        final Entry entry = Entry.parseFromLine(line);\r
+        if (entry == null) {\r
+          System.err.println("Invalid entry: " + line);\r
+          continue;\r
+        }\r
+\r
+        dict.entries.add(entry);\r
+\r
+        // lineCount counts accepted entries only.\r
+        if (lineCount % 10000 == 0) {\r
+          System.out.println("IndexBuilder: " + "lineCount=" + lineCount);\r
+        }\r
+        lineCount++;\r
+      }\r
+    } finally {\r
+      dictionaryFile.close();\r
+    }\r
+\r
+    createIndex(dict, Entry.LANG1);\r
+    createIndex(dict, Entry.LANG2);\r
+\r
+    System.out.println("Writing dictionary.");\r
+    final RandomAccessFile dictOut = new RandomAccessFile(args[1], "rw");\r
+    // Close the output even if truncating or writing throws.\r
+    try {\r
+      dictOut.setLength(0);\r
+      dict.write(dictOut);\r
+    } finally {\r
+      dictOut.close();\r
+    }\r
+  }\r
+\r
public static void createIndex(final Dictionary dict, final byte lang) {\r
+ System.out.println("Creating index: " + lang);\r
\r
- final SortedMap<String, TokenData> sortedIndex = new TreeMap<String, TokenData>(\r
- EntryFactory.entryFactory.getEntryComparator());\r
+ final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();\r
final EntryData entryDatas[] = new EntryData[dict.entries.size()];\r
\r
for (int e = 0; e < dict.entries.size(); ++e) {\r
.asList(WHITESPACE.split(text.trim())));\r
entryDatas[e] = new EntryData(tokens.size());\r
for (final String token : tokens) {\r
- TokenData tokenData = sortedIndex.get(token);\r
+ TokenData tokenData = tokenDatas.get(token);\r
if (tokenData == null) {\r
tokenData = new TokenData(token);\r
- sortedIndex.put(token, tokenData);\r
+ tokenDatas.put(token, tokenData);\r
}\r
tokenData.entries.add(e);\r
}\r
+\r
+ if (e % 10000 == 0) {\r
+ System.out.println("createIndex: " + "e=" + e);\r
+ }\r
}\r
\r
// Sort it.\r
\r
+ final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas\r
+ .values());\r
+ Collections.sort(sortedIndex);\r
+\r
final Comparator<Integer> entryComparator = new Comparator<Integer>() {\r
@Override\r
public int compare(Integer o1, Integer o2) {\r
}\r
};\r
\r
- for (final TokenData tokenData : sortedIndex.values()) {\r
+ for (final TokenData tokenData : tokenDatas.values()) {\r
Collections.sort(tokenData.entries, entryComparator);\r
}\r
\r
final List<Row> rows = dict.languages[lang].rows;\r
final List<IndexEntry> indexEntries = dict.languages[lang].sortedIndex;\r
\r
- int tokenDataIndex = 0;\r
- for (final TokenData tokenData : sortedIndex.values()) {\r
+ for (int t = 0; t < sortedIndex.size(); ++t) {\r
+ final TokenData tokenData = sortedIndex.get(t);\r
final int startRow = rows.size();\r
final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);\r
indexEntries.add(indexEntry);\r
\r
- final Row tokenRow = new Row(-(tokenDataIndex + 1));\r
+ final Row tokenRow = new Row(-(t + 1));\r
rows.add(tokenRow);\r
\r
for (final Integer e : tokenData.entries) {\r
final Row entryRow = new Row(e);\r
rows.add(entryRow);\r
}\r
- ++tokenDataIndex;\r
}\r
\r
}\r
}\r
}\r
\r
- static final class TokenData {\r
+ static final class TokenData implements Comparable<TokenData> {\r
final String token;\r
final List<Integer> entries = new ArrayList<Integer>();\r
\r
public TokenData(String token) {\r
this.token = token;\r
}\r
+ /**\r
+ * Orders token data by token text, delegating to the comparator returned by\r
+ * EntryFactory.entryFactory.getEntryComparator().\r
+ */\r
+ @Override\r
+ public int compareTo(TokenData that) {\r
+ return EntryFactory.entryFactory.getEntryComparator().compare(this.token,\r
+ that.token);\r
+ }\r
}\r
\r
}\r
import java.io.RandomAccessFile;\r
import java.util.Arrays;\r
import java.util.List;\r
+import java.util.concurrent.atomic.AtomicBoolean;\r
\r
import junit.framework.TestCase;\r
\r
import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
+import com.hughes.android.dictionary.Dictionary.Language;\r
import com.hughes.android.dictionary.Dictionary.Row;\r
\r
public class DictionaryTest extends TestCase {\r
final File file = File.createTempFile("asdf", "asdf");\r
file.deleteOnExit();\r
\r
- final Dictionary goldenDict;\r
+// final Dictionary goldenDict;\r
final List<Entry> entries = Arrays.asList(\r
new Entry("der Hund", "the dog"),\r
new Entry("Die grosse Katze", "The big cat"), \r
dict.write(raf);\r
raf.close();\r
\r
- goldenDict = dict;\r
+// goldenDict = dict;\r
}\r
\r
final RandomAccessFile raf = new RandomAccessFile(file, "r");\r
++rowCount;\r
}\r
\r
+ for (int l = 0; l <= 1; l++) {\r
+ final Language lang = dict.languages[l];\r
+ for (int i = 0; i < lang.sortedIndex.size(); i++) {\r
+ final IndexEntry indexEntry = lang.sortedIndex.get(i);\r
+ if (indexEntry.word.toLowerCase().equals("dieb"))\r
+ System.out.println();\r
+ final IndexEntry lookedUpEntry = lang.sortedIndex.get(lang.lookup(indexEntry.word, new AtomicBoolean(false)));\r
+ if (!indexEntry.word.toLowerCase().equals(lookedUpEntry.word.toLowerCase()))\r
+ System.out.println();\r
+ assertEquals(indexEntry.word.toLowerCase(), lookedUpEntry.word.toLowerCase());\r
+ }\r
+ }\r
+ \r
+ assertEquals("Die", dict.languages[0].sortedIndex.get(dict.languages[0].lookup("die", new AtomicBoolean())).word);\r
+\r
+ }\r
+ \r
+ public void testTextNorm() throws IOException {\r
+// final File file = File.createTempFile("asdf", "asdf");\r
+// file.deleteOnExit();\r
+\r
+// final Dictionary goldenDict;\r
+ final List<Entry> entries = Arrays.asList(\r
+ new Entry("der Hund", "the dog"),\r
+ new Entry("Die grosse Katze", "The big cat"), \r
+ new Entry("die Katze", "the cat"),\r
+ new Entry("gross", "big"),\r
+ new Entry("Dieb", "thief"),\r
+ new Entry("rennen", "run"));\r
\r
}\r
\r