From: Thad Hughes Date: Tue, 31 Jan 2012 22:56:05 +0000 (-0800) Subject: Moved normalization, more tests. X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=1ae83ab5b046cb0c912620aa9f81de091cbdb8e5 Moved normalization, more tests. --- diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java index ca67a8c..bfc3661 100644 --- a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java +++ b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java @@ -11,13 +11,13 @@ import com.hughes.android.dictionary.DictionaryInfo; public class CheckDictionariesMain { static final String BASE_URL = "http://quickdic-dictionary.googlecode.com/files/"; - static final String VERSION_CODE = "v002"; + static final String VERSION_CODE = "v003"; public static void main(String[] args) throws IOException { final File dictDir = new File(DictionaryBuilderMain.OUTPUTS); final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt")); - dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tVERSION_CODE\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2"); +// dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tVERSION_CODE\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2"); final File[] files = dictDir.listFiles(); Arrays.sort(files); @@ -37,7 +37,7 @@ public class CheckDictionariesMain { dictionaryInfo.downloadUrl = BASE_URL + dictFile.getName() + "." + VERSION_CODE + ".zip"; // TODO: zip it right here.... dictionaryInfo.uncompressedBytes = dictFile.length(); - final File zipFile = new File(dictFile.getPath() + ".zip"); + final File zipFile = new File(dictFile.getPath() + "." + VERSION_CODE + ".zip"); dictionaryInfo.zipBytes = zipFile.canRead() ? zipFile.length() : -1; // Print it. diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 9998860..6e640d6 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -63,9 +63,9 @@ public class DictionaryBuilderMain extends TestCase { // isoToWikiName.keySet().retainAll(Arrays.asList("UK", "HR", "FI")); //isoToWikiName.clear(); - boolean go = true; + boolean go = false; for (final String foreignIso : isoToWikiName.keySet()) { - if (foreignIso.equals("blah")) { + if (foreignIso.equals("BO")) { go = true; } if (!go) { diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index c3f4c4e..52c3161 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.RandomAccessFile; import java.util.Arrays; import java.util.List; +import java.util.Random; import java.util.concurrent.atomic.AtomicBoolean; import junit.framework.TestCase; @@ -198,13 +199,102 @@ public class DictionaryTest extends TestCase { { final List rows = enIndex.multiWordSearch(Arrays.asList("a", "station"), new AtomicBoolean(false)); - // TODO: bug, "a" isn't in stoplist for now... System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() == 0); - //assertEquals("Bahnhofsuhr {{de-noun|g=f|plural=Bahnhofsuhren}}\tstation clock (at a train station)", rows.get(0).toString()); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("Kraftwerk {n}\tpower plant (a station built for the production of electric power) (noun)", rows.get(0).toString()); + } + + { + // Should print: Giving up, too many words with prefix: p + final List rows = enIndex.multiWordSearch(Arrays.asList("p", "eat"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.toString().contains("verschlingen; verputzen\tto dispatch (eat)")); + } + + { + // Should print: Giving up, too many words with prefix: p + final List rows = enIndex.multiWordSearch(Arrays.asList("p", "p"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + { + // Should print: Giving up, too many words with prefix: a + final List rows = enIndex.multiWordSearch(Arrays.asList("a", "a"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + { + // Should print: Giving up, too many words with prefix: a + final List rows = enIndex.multiWordSearch(Arrays.asList("b", "ba"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + { + // Should print: Giving up, too many words with prefix: a + final List rows = enIndex.multiWordSearch(Arrays.asList("b", "ba"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); } raf.close(); } + public void testMultiSearchBigAF() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-AF_enwiktionary.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index enIndex = dict.indices.get(0); + + { + final List rows = enIndex.multiWordSearch(Arrays.asList("pig", "eats"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("pig (someone who overeats or eats rapidly) (noun)\tvark", rows.get(0).toString()); + } + + { + final List rows = enIndex.multiWordSearch(Arrays.asList("pig", "eat"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("pig (someone who overeats or eats rapidly) (noun)\tvark", rows.get(0).toString()); + } + + { + final List rows = enIndex.multiWordSearch(Arrays.asList("pi", "ea"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.toString().contains("pig (someone who overeats or eats rapidly) (noun)\tvark")); + } + + { + final List rows = enIndex.multiWordSearch(Arrays.asList("p", "eat"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.toString().contains("pig (someone who overeats or eats rapidly) (noun)\tvark")); + } + + + raf.close(); + } + + + public void testExactSearch() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-ZH_enwiktionary.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index zhIndex = dict.indices.get(1); + + final Random random = new Random(10); + + for (int i = 0; i < 1000; ++i) { + final int ii = random.nextInt(zhIndex.sortedIndexEntries.size()); + final IndexEntry indexEntry = zhIndex.sortedIndexEntries.get(ii); + final IndexEntry found = zhIndex.findExact(indexEntry.token); + assertNotNull(found); + assertEquals(indexEntry.token, found.token); + assertEquals(indexEntry, found); // Test of caching.... + } + + raf.close(); + } + + } diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 95a6b07..0d11191 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -39,7 +39,7 @@ public class IndexBuilder { IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { this.dictionaryBuilder = dictionaryBuilder; index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); - tokenToData = new TreeMap(new NormalizeComparator(index.normalizer(), language.getCollator())); + tokenToData = new TreeMap(index.getSortComparator()); this.stoplist = stoplist; } @@ -58,7 +58,7 @@ public class IndexBuilder { index.mainTokenCount++; } // System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); - int numRows = 0; + int numRows = 0; // off by one--doesn't count the token row! // System.out.println("TOKEN: " + tokenData.token); for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { for (final IndexedEntry entryData : typeToEntry.getValue()) { diff --git a/src/com/hughes/android/dictionary/engine/LanguageTest.java b/src/com/hughes/android/dictionary/engine/LanguageTest.java index f729b22..5e21348 100644 --- a/src/com/hughes/android/dictionary/engine/LanguageTest.java +++ b/src/com/hughes/android/dictionary/engine/LanguageTest.java @@ -168,7 +168,7 @@ public class LanguageTest extends TestCase { } public void testEnWiktionaryNames() { - assertEquals(EnWiktionaryLangs.isoCodeToWikiName.keySet(), Language.isoCodeToResourceId.keySet()); + assertEquals(EnWiktionaryLangs.isoCodeToWikiName.keySet(), Language.isoCodeToResources.keySet()); } } diff --git a/src/com/hughes/android/dictionary/engine/NormalizeComparator.java b/src/com/hughes/android/dictionary/engine/NormalizeComparator.java deleted file mode 100644 index b0efdee..0000000 --- a/src/com/hughes/android/dictionary/engine/NormalizeComparator.java +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.hughes.android.dictionary.engine; - -import java.util.Comparator; - -import com.ibm.icu.text.Transliterator; - -public class NormalizeComparator implements Comparator { - - final Transliterator normalizer; - final Comparator comparator; - - public NormalizeComparator(final Transliterator normalizer, - final Comparator comparator) { - this.normalizer = normalizer; - this.comparator = comparator; - } - - @Override - public int compare(final String s1, final String s2) { - final String n1 = normalizer.transform(s1); - final String n2 = normalizer.transform(s2); - final int cn = comparator.compare(n1, n2); - if (cn != 0) { - return cn; - } - return comparator.compare(s1, s2); - } - -} diff --git a/todo.txt b/todo.txt index 1ba238d..00c264d 100644 --- a/todo.txt +++ b/todo.txt @@ -1,6 +1,6 @@ For next release: +reload dictionaryInfo sometime... downloads -about dict dialog history dialog fix up dictionary manager: thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up). @@ -98,3 +98,4 @@ num words in UI. multiword find. ! enter should hide keyboard eng_urdu +about dict dialog