From 681f96a678518952d14327f435b98d5cff0e36d6 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Fri, 5 Nov 2010 12:26:34 -0700 Subject: [PATCH] Moved around testdata. --- .../dictionary/engine/DictFileParser.java | 10 ++- .../dictionary/engine/DictionaryBuilder.java | 2 - .../engine/DictionaryBuilderTest.java | 8 +- .../dictionary/engine/DictionaryTest.java | 74 ++++++++++++------- .../dictionary/engine/IndexBuilder.java | 36 ++++++--- 5 files changed, 84 insertions(+), 46 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/DictFileParser.java b/src/com/hughes/android/dictionary/engine/DictFileParser.java index 2119a10..ebdbaef 100644 --- a/src/com/hughes/android/dictionary/engine/DictFileParser.java +++ b/src/com/hughes/android/dictionary/engine/DictFileParser.java @@ -22,7 +22,7 @@ public class DictFileParser { // Chemnitz static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); - static final Pattern PIPE = Pattern.compile(" \\| "); + static final Pattern PIPE = Pattern.compile("\\|"); static final Pattern SPACES = Pattern.compile("\\s+"); static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}"); @@ -65,8 +65,13 @@ public class DictFileParser { public void parseFile(final File file) throws IOException { final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); String line; + int count = 0; while ((line = reader.readLine()) != null) { + if (count % 10000 == 0) { + logger.info("count=" + count + ", line=" + line); + } parseLine(line); + ++count; } } @@ -104,12 +109,13 @@ public class DictFileParser { final Pair[] pairs = new Pair[subfields[0].length]; for (int i = 0; i < pairs.length; ++i) { + subfields[0][i] = subfields[0][i].trim(); + subfields[1][i] = subfields[1][i].trim(); pairs[i] = new Pair(subfields[0][i], subfields[1][i]); } final PairEntry pairEntry = new PairEntry(pairs); final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry); dictBuilder.dictionary.pairEntries.add(pairEntry); - dictBuilder.entryDatas.add(entryData); // TODO: delete me. for (int l = 0; l < 2; ++l) { // alreadyDone.clear(); diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 9ee0347..6bb1115 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -39,8 +39,6 @@ public class DictionaryBuilder { final Dictionary dictionary; - final List entryDatas = new ArrayList(); - final List indexBuilders = new ArrayList(); public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) { diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 4937015..e68bf5e 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -18,19 +18,19 @@ public class DictionaryBuilderTest extends TestCase { "--dictOut=" + result.getAbsolutePath(), "--lang1=DE", "--lang2=EN", - "--dictInfo=@testdata/de_en_dictInfo.txt", + "--dictInfo=@testdata/de-en_dictInfo.txt", - "--input1=testdata/de-en-chemnitz_100", + "--input1=testdata/de-en_chemnitz_100", "--input1Name=dictcc", "--input1Charset=UTF8", "--input1Format=chemnitz", - "--input2=testdata/de-en-dictcc_100", + "--input2=testdata/de-en_dictcc_100", "--input2Name=dictcc", "--input2Charset=UTF8", "--input2Format=dictcc", - "--print=testdata/de_en.test", + "--print=testdata/de-en.test", }); // Check it once: diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index bfceeff..4b45348 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -14,39 +14,23 @@ import com.hughes.android.dictionary.engine.Index.SearchResult; public class DictionaryTest extends TestCase { - - RandomAccessFile raf; - Dictionary dict; - Index deIndex; - - @Override - public void setUp() { - try { - raf = new RandomAccessFile("testdata/de_en.dict", "r"); - dict = new Dictionary(raf); - } catch (IOException e) { - throw new RuntimeException(e); - } - - deIndex = dict.indices.get(0); -} - - @Override - public void tearDown() { - try { - raf.close(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - + public void testGermanMetadata() throws IOException { + final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + assertEquals("de", deIndex.shortName); assertEquals("de->en", deIndex.longName); + + raf.close(); } public void testGermanIndex() throws IOException { + final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) { System.out.println("testing: " + indexEntry.token); final Index.SearchResult searchResult = deIndex.findLongestSubstring(indexEntry.token, new AtomicBoolean( @@ -62,6 +46,7 @@ public class DictionaryTest extends TestCase { assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aAac", new AtomicBoolean(false))); // Before the beginning. + assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("", new AtomicBoolean(false))); assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("__", new AtomicBoolean(false))); // After the end. @@ -70,6 +55,8 @@ public class DictionaryTest extends TestCase { assertSearchResult("ab", "aaac", deIndex.findLongestSubstring("aaaca", new AtomicBoolean(false))); assertSearchResult("machen", "machen", deIndex.findLongestSubstring("m", new AtomicBoolean(false))); + assertFalse(deIndex.findLongestSubstring("macdddd", new AtomicBoolean(false)).success); + assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberprüfe", new AtomicBoolean(false))); assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpruefe", new AtomicBoolean(false))); @@ -79,6 +66,12 @@ public class DictionaryTest extends TestCase { assertSearchResult("überprüfen", "überprüfe", deIndex.findLongestSubstring("überprüfeBLEH", new AtomicBoolean(false))); + // Check that search in lowercase works. + assertSearchResult("Alibi", "Alibi", deIndex.findLongestSubstring("alib", new AtomicBoolean(false))); + assertTrue(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).success); + System.out.println(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).toString()); + + raf.close(); } private void assertSearchResult(final String insertionPoint, final String longestPrefix, @@ -87,7 +80,11 @@ public class DictionaryTest extends TestCase { assertEquals(longestPrefix, actual.longestPrefix.token); } - public void testGermanTokenRows() { + public void testGermanTokenRows() throws IOException { + final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + // Pre-cache a few of these, just to make sure that's working. for (int i = 0; i < deIndex.rows.size(); i += 7) { deIndex.rows.get(i).getTokenRow(true); @@ -110,6 +107,8 @@ public class DictionaryTest extends TestCase { // This will break if the Row cache isn't big enough. assertEquals(lastTokenRow, row.getTokenRow(false)); } + + raf.close(); } public void testGermanSort() { @@ -130,6 +129,10 @@ public class DictionaryTest extends TestCase { "Großformats", "Großpoo", "Großpoos", + "Hörweite", + "hos", + "Höschen", + "Hostel", "hulle", "Hulle", "hülle", @@ -188,5 +191,20 @@ public class DictionaryTest extends TestCase { assertEquals("es", Language.lookup("es").getSymbol()); } + public void testTextNorm() { + assertEquals("hoschen", "Höschen".toLowerCase(Language.de.locale)); + } + + public void testChemnitz() throws IOException { + final RandomAccessFile raf = new RandomAccessFile("testdata/de-en_chemnitz.dict", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + + //assertSearchResult("Höschen", "Hos", deIndex.findLongestSubstring("Hos", new AtomicBoolean(false))); + //assertSearchResult("Höschen", "hos", deIndex.findLongestSubstring("hos", new AtomicBoolean(false))); + + + raf.close(); + } } diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 2db6290..0e25e33 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -1,6 +1,8 @@ package com.hughes.android.dictionary.engine; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.EnumMap; import java.util.HashSet; import java.util.List; @@ -9,6 +11,8 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import com.hughes.android.dictionary.engine.Index.IndexEntry; + public class IndexBuilder { @@ -29,24 +33,36 @@ public class IndexBuilder { final List rows = index.rows; for (final TokenData tokenData : tokenToData.values()) { tokenEntryDatas.clear(); - final int indexRow = index.sortedIndexEntries.size(); - index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, rows.size())); - rows.add(new TokenRow(indexRow, rows.size(), index)); - System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); - int count = 0; - System.out.println("TOKEN: " + tokenData.token); + final int indexIndex = index.sortedIndexEntries.size(); + final int startRow = rows.size(); + rows.add(new TokenRow(indexIndex, rows.size(), index)); +// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); + int numRows = 0; +// System.out.println("TOKEN: " + tokenData.token); for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { for (final EntryData entryData : typeToEntry.getValue()) { if (tokenEntryDatas.add(entryData)) { rows.add(new PairEntry.Row(entryData.index(), rows.size(), index)); - ++count; + ++numRows; - System.out.print(" " + typeToEntry.getKey() + ": "); - rows.get(rows.size() - 1).print(System.out); - System.out.println(); +// System.out.print(" " + typeToEntry.getKey() + ": "); + // rows.get(rows.size() - 1).print(System.out); +// System.out.println(); } } } + index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, startRow, numRows)); + } + + final List sortedEntries = new ArrayList(index.sortedIndexEntries); + Collections.sort(sortedEntries, new Comparator() { + @Override + public int compare(IndexEntry object1, IndexEntry object2) { + return object2.numRows - object1.numRows; + }}); + System.out.println("Most common tokens:"); + for (int i = 0; i < 50 && i < sortedEntries.size(); ++i) { + System.out.println(" " + sortedEntries.get(i)); } } -- 2.43.0