From e624542f38775fe7891cfb724437e8f9639bd503 Mon Sep 17 00:00:00 2001 From: thadh Date: Tue, 17 Mar 2009 22:49:15 -0700 Subject: [PATCH] go --- .../android/dictionary/DictionaryBuilder.java | 84 ++++++++++++++++--- .../android/dictionary/DictionaryTest.java | 35 +++++++- 2 files changed, 105 insertions(+), 14 deletions(-) diff --git a/src/com/hughes/android/dictionary/DictionaryBuilder.java b/src/com/hughes/android/dictionary/DictionaryBuilder.java index acf0345..ae6fece 100755 --- a/src/com/hughes/android/dictionary/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/DictionaryBuilder.java @@ -1,14 +1,16 @@ package com.hughes.android.dictionary; +import java.io.IOException; +import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; +import java.util.Map; import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; import java.util.regex.Pattern; import com.hughes.android.dictionary.Dictionary.IndexEntry; @@ -18,10 +20,55 @@ public class DictionaryBuilder { static final Pattern WHITESPACE = Pattern.compile("\\s+"); + public static void main(String[] args) throws IOException, + ClassNotFoundException { + if (args.length != 2) { + System.err.println("inputfile outputfile"); + return; + } + + final Dictionary dict = new Dictionary("de", "en"); + final RandomAccessFile dictionaryFile = new RandomAccessFile(args[0], "r"); + String line; + int lineCount = 0; + long fileLocation = 0; + while ((line = dictionaryFile.readLine()) != null) { + assert ((int) fileLocation) == fileLocation; + line = line.trim(); + if (line.isEmpty() || line.startsWith("#")) { + continue; + } + + final Entry entry = Entry.parseFromLine(line); + if (entry == null) { + System.err.println("Invalid entry: " + line); + continue; + } + + dict.entries.add(entry); + + if (lineCount % 10000 == 0) { + System.out.println("IndexBuilder: " + "lineCount=" + lineCount); + } + lineCount++; + fileLocation = dictionaryFile.getFilePointer(); + } + dictionaryFile.close(); + + createIndex(dict, Entry.LANG1); + createIndex(dict, Entry.LANG2); + + System.out.println("Writing dictionary."); + final RandomAccessFile dictOut = new RandomAccessFile(args[1], "rw"); + dictOut.setLength(0); + dict.write(dictOut); + dictOut.close(); + } + public static void createIndex(final Dictionary dict, final byte lang) { + System.out.println("Creating index: " + lang); - final SortedMap sortedIndex = new TreeMap( - EntryFactory.entryFactory.getEntryComparator()); + final Map tokenDatas = new HashMap(); final EntryData entryDatas[] = new EntryData[dict.entries.size()]; for (int e = 0; e < dict.entries.size(); ++e) { @@ -31,17 +78,25 @@ public class DictionaryBuilder { .asList(WHITESPACE.split(text.trim()))); entryDatas[e] = new EntryData(tokens.size()); for (final String token : tokens) { - TokenData tokenData = sortedIndex.get(token); + TokenData tokenData = tokenDatas.get(token); if (tokenData == null) { tokenData = new TokenData(token); - sortedIndex.put(token, tokenData); + tokenDatas.put(token, tokenData); } tokenData.entries.add(e); } + + if (e % 10000 == 0) { + System.out.println("createIndex: " + "e=" + e); + } } // Sort it. + final List sortedIndex = new ArrayList(tokenDatas + .values()); + Collections.sort(sortedIndex); + final Comparator entryComparator = new Comparator() { @Override public int compare(Integer o1, Integer o2) { @@ -50,7 +105,7 @@ public class DictionaryBuilder { } }; - for (final TokenData tokenData : sortedIndex.values()) { + for (final TokenData tokenData : tokenDatas.values()) { Collections.sort(tokenData.entries, entryComparator); } @@ -59,20 +114,19 @@ public class DictionaryBuilder { final List rows = dict.languages[lang].rows; final List indexEntries = dict.languages[lang].sortedIndex; - int tokenDataIndex = 0; - for (final TokenData tokenData : sortedIndex.values()) { + for (int t = 0; t < sortedIndex.size(); ++t) { + final TokenData tokenData = sortedIndex.get(t); final int startRow = rows.size(); final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow); indexEntries.add(indexEntry); - final Row tokenRow = new Row(-(tokenDataIndex + 1)); + final Row tokenRow = new Row(-(t + 1)); rows.add(tokenRow); for (final Integer e : tokenData.entries) { final Row entryRow = new Row(e); rows.add(entryRow); } - ++tokenDataIndex; } } @@ -85,7 +139,7 @@ public class DictionaryBuilder { } } - static final class TokenData { + static final class TokenData implements Comparable { final String token; final List entries = new ArrayList(); @@ -94,6 +148,12 @@ public class DictionaryBuilder { public TokenData(String token) { this.token = token; } + + @Override + public int compareTo(TokenData that) { + return EntryFactory.entryFactory.getEntryComparator().compare(this.token, + that.token); + } } } diff --git a/src/com/hughes/android/dictionary/DictionaryTest.java b/src/com/hughes/android/dictionary/DictionaryTest.java index 43fb7ab..806e836 100755 --- a/src/com/hughes/android/dictionary/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/DictionaryTest.java @@ -5,10 +5,12 @@ import java.io.IOException; import java.io.RandomAccessFile; import java.util.Arrays; import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import junit.framework.TestCase; import com.hughes.android.dictionary.Dictionary.IndexEntry; +import com.hughes.android.dictionary.Dictionary.Language; import com.hughes.android.dictionary.Dictionary.Row; public class DictionaryTest extends TestCase { @@ -17,7 +19,7 @@ public class DictionaryTest extends TestCase { final File file = File.createTempFile("asdf", "asdf"); file.deleteOnExit(); - final Dictionary goldenDict; +// final Dictionary goldenDict; final List entries = Arrays.asList( new Entry("der Hund", "the dog"), new Entry("Die grosse Katze", "The big cat"), @@ -37,7 +39,7 @@ public class DictionaryTest extends TestCase { dict.write(raf); raf.close(); - goldenDict = dict; +// goldenDict = dict; } final RandomAccessFile raf = new RandomAccessFile(file, "r"); @@ -62,6 +64,35 @@ public class DictionaryTest extends TestCase { ++rowCount; } + for (int l = 0; l <= 1; l++) { + final Language lang = dict.languages[l]; + for (int i = 0; i < lang.sortedIndex.size(); i++) { + final IndexEntry indexEntry = lang.sortedIndex.get(i); + if (indexEntry.word.toLowerCase().equals("dieb")) + System.out.println(); + final IndexEntry lookedUpEntry = lang.sortedIndex.get(lang.lookup(indexEntry.word, new AtomicBoolean(false))); + if (!indexEntry.word.toLowerCase().equals(lookedUpEntry.word.toLowerCase())) + System.out.println(); + assertEquals(indexEntry.word.toLowerCase(), lookedUpEntry.word.toLowerCase()); + } + } + + assertEquals("Die", dict.languages[0].sortedIndex.get(dict.languages[0].lookup("die", new AtomicBoolean())).word); + + } + + public void testTextNorm() throws IOException { +// final File file = File.createTempFile("asdf", "asdf"); +// file.deleteOnExit(); + +// final Dictionary goldenDict; + final List entries = Arrays.asList( + new Entry("der Hund", "the dog"), + new Entry("Die grosse Katze", "The big cat"), + new Entry("die Katze", "the cat"), + new Entry("gross", "big"), + new Entry("Dieb", "thief"), + new Entry("rennen", "run")); } -- 2.43.0