Moved around testdata.

author Thad Hughes <thad.hughes@gmail.com>

Fri, 5 Nov 2010 19:26:34 +0000 (12:26 -0700)

committer Thad Hughes <thad.hughes@gmail.com>

Tue, 13 Dec 2011 18:38:49 +0000 (10:38 -0800)
author Thad Hughes <thad.hughes@gmail.com>
Fri, 5 Nov 2010 19:26:34 +0000 (12:26 -0700)
committer Thad Hughes <thad.hughes@gmail.com>
Tue, 13 Dec 2011 18:38:49 +0000 (10:38 -0800)
diff --git a/src/com/hughes/android/dictionary/engine/DictFileParser.java b/src/com/hughes/android/dictionary/engine/DictFileParser.java

index 2119a10fb61aecf5abcd34cfe8f9f8c9b21be416..ebdbaefb727fd9c338042c582aa3145ced4d27af 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/DictFileParser.java
+++ b/src/com/hughes/android/dictionary/engine/DictFileParser.java
@@ -22,7 +22,7 @@ public class DictFileParser {
  
    // Chemnitz
    static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
-  static final Pattern PIPE = Pattern.compile(" \\| ");
+  static final Pattern PIPE = Pattern.compile("\\|");
    
    static final Pattern SPACES = Pattern.compile("\\s+");
    static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
@@ -65,8 +65,13 @@ public class DictFileParser {
    public void parseFile(final File file) throws IOException {
      final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
      String line;
+    int count = 0;
      while ((line = reader.readLine()) != null) {
+      if (count % 10000 == 0) {
+        logger.info("count=" + count + ", line=" + line);
+      }
        parseLine(line);
+      ++count;
      }
    }
    
@@ -104,12 +109,13 @@ public class DictFileParser {
      
      final Pair[] pairs = new Pair[subfields[0].length];
      for (int i = 0; i < pairs.length; ++i) {
+      subfields[0][i] = subfields[0][i].trim();
+      subfields[1][i] = subfields[1][i].trim();
        pairs[i] = new Pair(subfields[0][i], subfields[1][i]);
      }
      final PairEntry pairEntry = new PairEntry(pairs);
      final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
      dictBuilder.dictionary.pairEntries.add(pairEntry);
-    dictBuilder.entryDatas.add(entryData);  // TODO: delete me.
      
      for (int l = 0; l < 2; ++l) {
        // alreadyDone.clear();
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java

index 9ee034719dae879a433b09df722d08bba8271bb1..6bb1115e29e648ec0acea9a1b47028574bf997ca 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
@@ -39,8 +39,6 @@ public class DictionaryBuilder {
    
    final Dictionary dictionary;
    
-  final List<EntryData> entryDatas = new ArrayList<EntryData>();
-  
    final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
    
    public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) {
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java

index 493701560797633dd78ebaa67db30a38de59bdfb..e68bf5e29aceccb23427e2c7260ff3affacd46fe 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
@@ -18,19 +18,19 @@ public class DictionaryBuilderTest extends TestCase {
          "--dictOut=" + result.getAbsolutePath(),
          "--lang1=DE",
          "--lang2=EN",
-        "--dictInfo=@testdata/de_en_dictInfo.txt",
+        "--dictInfo=@testdata/de-en_dictInfo.txt",
  
-        "--input1=testdata/de-en-chemnitz_100",
+        "--input1=testdata/de-en_chemnitz_100",
          "--input1Name=dictcc",
          "--input1Charset=UTF8",
          "--input1Format=chemnitz",
  
-        "--input2=testdata/de-en-dictcc_100",
+        "--input2=testdata/de-en_dictcc_100",
          "--input2Name=dictcc",
          "--input2Charset=UTF8",
          "--input2Format=dictcc",
          
-        "--print=testdata/de_en.test",
+        "--print=testdata/de-en.test",
      });
      
      // Check it once:
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java

index bfceeffa515a57b5a22f61f9eb68210d3213d212..4b45348ae8e6ae342f3338c675548dbf2450148c 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java
+++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java
@@ -14,39 +14,23 @@ import com.hughes.android.dictionary.engine.Index.SearchResult;
  
  
  public class DictionaryTest extends TestCase {
-  
-  RandomAccessFile raf;
-  Dictionary dict;
-  Index deIndex; 
-  
-  @Override
-  public void setUp() {
-    try {
-      raf = new RandomAccessFile("testdata/de_en.dict", "r");
-      dict = new Dictionary(raf);
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-
-    deIndex = dict.indices.get(0);
-}
-  
-  @Override
-  public void tearDown() {
-    try {
-      raf.close();
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-  }
-  
-
+    
    public void testGermanMetadata() throws IOException {
+    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+    final Dictionary dict = new Dictionary(raf);
+    final Index deIndex = dict.indices.get(0);
+    
      assertEquals("de", deIndex.shortName);
      assertEquals("de->en", deIndex.longName);
+    
+    raf.close();
    }
    
    public void testGermanIndex() throws IOException {
+    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+    final Dictionary dict = new Dictionary(raf);
+    final Index deIndex = dict.indices.get(0);
+    
      for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) {
        System.out.println("testing: " + indexEntry.token);
        final Index.SearchResult searchResult = deIndex.findLongestSubstring(indexEntry.token, new AtomicBoolean(
@@ -62,6 +46,7 @@ public class DictionaryTest extends TestCase {
      assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aAac", new AtomicBoolean(false)));
  
      // Before the beginning.
+    assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("", new AtomicBoolean(false)));
      assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("__", new AtomicBoolean(false)));
      
      // After the end.
@@ -70,6 +55,8 @@ public class DictionaryTest extends TestCase {
      assertSearchResult("ab", "aaac", deIndex.findLongestSubstring("aaaca", new AtomicBoolean(false)));
      assertSearchResult("machen", "machen", deIndex.findLongestSubstring("m", new AtomicBoolean(false)));
  
+    assertFalse(deIndex.findLongestSubstring("macdddd", new AtomicBoolean(false)).success);
+
  
      assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberprüfe", new AtomicBoolean(false)));
      assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpruefe", new AtomicBoolean(false)));
@@ -79,6 +66,12 @@ public class DictionaryTest extends TestCase {
  
      assertSearchResult("überprüfen", "überprüfe", deIndex.findLongestSubstring("überprüfeBLEH", new AtomicBoolean(false)));
  
+    // Check that search in lowercase works.
+    assertSearchResult("Alibi", "Alibi", deIndex.findLongestSubstring("alib", new AtomicBoolean(false)));
+    assertTrue(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).success);
+    System.out.println(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).toString());
+    
+    raf.close();
    }
    
    private void assertSearchResult(final String insertionPoint, final String longestPrefix,
@@ -87,7 +80,11 @@ public class DictionaryTest extends TestCase {
      assertEquals(longestPrefix, actual.longestPrefix.token);
    }
  
-  public void testGermanTokenRows() {
+  public void testGermanTokenRows() throws IOException {
+    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+    final Dictionary dict = new Dictionary(raf);
+    final Index deIndex = dict.indices.get(0);
+    
      // Pre-cache a few of these, just to make sure that's working.
      for (int i = 0; i < deIndex.rows.size(); i += 7) {
        deIndex.rows.get(i).getTokenRow(true);
@@ -110,6 +107,8 @@ public class DictionaryTest extends TestCase {
        // This will break if the Row cache isn't big enough.
        assertEquals(lastTokenRow, row.getTokenRow(false));
      }
+    
+    raf.close();
    }
    
    public void testGermanSort() {
@@ -130,6 +129,10 @@ public class DictionaryTest extends TestCase {
          "Großformats",
          "Großpoo",
          "Großpoos",
+        "Hörweite",
+        "hos",
+        "Höschen",
+        "Hostel",
          "hulle",
          "Hulle",
          "hülle",
@@ -188,5 +191,20 @@ public class DictionaryTest extends TestCase {
      assertEquals("es", Language.lookup("es").getSymbol());
    }
  
+  public void testTextNorm() {
+    assertEquals("hoschen", "Höschen".toLowerCase(Language.de.locale));
+  }
+
+  public void testChemnitz() throws IOException {
+    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en_chemnitz.dict", "r");
+    final Dictionary dict = new Dictionary(raf);
+    final Index deIndex = dict.indices.get(0);
+    
+    //assertSearchResult("Höschen", "Hos", deIndex.findLongestSubstring("Hos", new AtomicBoolean(false)));
+    //assertSearchResult("Höschen", "hos", deIndex.findLongestSubstring("hos", new AtomicBoolean(false)));
+ 
+
+    raf.close();
+  }
  
  }
diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java

index 2db62905eb899342e22ba4f6eb84557c45b233ee..0e25e3388b6dbbed2070dd7f43325188a92e75f7 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java
+++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java
@@ -1,6 +1,8 @@
  package com.hughes.android.dictionary.engine;
  
  import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
  import java.util.EnumMap;
  import java.util.HashSet;
  import java.util.List;
@@ -9,6 +11,8 @@ import java.util.Set;
  import java.util.SortedMap;
  import java.util.TreeMap;
  
+import com.hughes.android.dictionary.engine.Index.IndexEntry;
+
  
  public class IndexBuilder {
    
@@ -29,24 +33,36 @@ public class IndexBuilder {
      final List<RowBase> rows = index.rows;
      for (final TokenData tokenData : tokenToData.values()) {
        tokenEntryDatas.clear();
-      final int indexRow = index.sortedIndexEntries.size();
-      index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, rows.size()));
-      rows.add(new TokenRow(indexRow, rows.size(), index));
-      System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
-      int count = 0;
-      System.out.println("TOKEN: " + tokenData.token);
+      final int indexIndex = index.sortedIndexEntries.size();
+      final int startRow = rows.size();
+      rows.add(new TokenRow(indexIndex, rows.size(), index));
+//      System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
+      int numRows = 0;
+//      System.out.println("TOKEN: " + tokenData.token);
        for (final Map.Entry<EntryTypeName, List<EntryData>> typeToEntry : tokenData.typeToEntries.entrySet()) {
          for (final EntryData entryData : typeToEntry.getValue()) {
            if (tokenEntryDatas.add(entryData)) {
              rows.add(new PairEntry.Row(entryData.index(), rows.size(), index));
-            ++count;
+            ++numRows;
              
-            System.out.print("  " + typeToEntry.getKey() + ": ");
-            rows.get(rows.size() - 1).print(System.out);
-            System.out.println();
+//            System.out.print("  " + typeToEntry.getKey() + ": ");
+  //          rows.get(rows.size() - 1).print(System.out);
+//            System.out.println();
            }
          }
        }
+      index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, startRow, numRows));
+    }
+    
+    final List<IndexEntry> sortedEntries = new ArrayList<IndexEntry>(index.sortedIndexEntries);
+    Collections.sort(sortedEntries, new Comparator<IndexEntry>() {
+      @Override
+      public int compare(IndexEntry object1, IndexEntry object2) {
+        return object2.numRows - object1.numRows;
+      }});
+    System.out.println("Most common tokens:");
+    for (int i = 0; i < 50 && i < sortedEntries.size(); ++i) {
+      System.out.println("  " + sortedEntries.get(i));
      }
    }
author	Thad Hughes <thad.hughes@gmail.com>
	Fri, 5 Nov 2010 19:26:34 +0000 (12:26 -0700)
committer	Thad Hughes <thad.hughes@gmail.com>
	Tue, 13 Dec 2011 18:38:49 +0000 (10:38 -0800)
src/com/hughes/android/dictionary/engine/DictFileParser.java		patch | blob | history
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java		patch | blob | history
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java		patch | blob | history
src/com/hughes/android/dictionary/engine/DictionaryTest.java		patch | blob | history
src/com/hughes/android/dictionary/engine/IndexBuilder.java		patch | blob | history