Multi word search now looks for exact matches of TokenRows.

[Dictionary.git] / src / com / hughes / android / dictionary / engine / Index.java
diff --git a/src/com/hughes/android/dictionary/engine/Index.java b/src/com/hughes/android/dictionary/engine/Index.java

index db0544081cdd67f288b5394bfa3597d1ae0e54b6..68a0dc27a9d6ee3645cde0a322839bafa98bbfef 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/Index.java
+++ b/src/com/hughes/android/dictionary/engine/Index.java
@@ -17,6 +17,20 @@
   */
  package com.hughes.android.dictionary.engine;
  
+import com.hughes.android.dictionary.DictionaryInfo;
+import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
+import com.hughes.android.dictionary.engine.RowBase.RowKey;
+import com.hughes.util.CachingList;
+import com.hughes.util.CollectionUtil;
+import com.hughes.util.TransformingList;
+import com.hughes.util.raf.RAFList;
+import com.hughes.util.raf.RAFSerializable;
+import com.hughes.util.raf.RAFSerializer;
+import com.hughes.util.raf.SerializableSerializer;
+import com.hughes.util.raf.UniformRAFList;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.Transliterator;
+
  import java.io.IOException;
  import java.io.PrintStream;
  import java.io.RandomAccessFile;
@@ -33,24 +47,11 @@ import java.util.Set;
  import java.util.concurrent.atomic.AtomicBoolean;
  import java.util.regex.Pattern;
  
-import com.hughes.android.dictionary.DictionaryInfo;
-import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
-import com.hughes.android.dictionary.engine.RowBase.RowKey;
-import com.hughes.util.CachingList;
-import com.hughes.util.TransformingList;
-import com.hughes.util.raf.RAFList;
-import com.hughes.util.raf.RAFSerializable;
-import com.hughes.util.raf.RAFSerializer;
-import com.hughes.util.raf.SerializableSerializer;
-import com.hughes.util.raf.UniformRAFList;
-import com.ibm.icu.text.Collator;
-import com.ibm.icu.text.Transliterator;
-
  public final class Index implements RAFSerializable<Index> {
    
    static final int CACHE_SIZE = 5000;
    
-  final Dictionary dict;
+  public final Dictionary dict;
    
    public final String shortName;  // Typically the ISO code for the language.
    public final String longName;
@@ -124,7 +125,7 @@ public final class Index implements RAFSerializable<Index> {
      if (dict.dictFileVersion >= 2) {
        mainTokenCount = raf.readInt();
      }
-    sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
+    sortedIndexEntries = CachingList.create(RAFList.create(raf, indexEntrySerializer, raf.getFilePointer()), CACHE_SIZE);
      if (dict.dictFileVersion >= 4) {
        stoplist = new SerializableSerializer<Set<String>>().read(raf);
      } else {
@@ -143,7 +144,7 @@ public final class Index implements RAFSerializable<Index> {
      if (dict.dictFileVersion >= 2) {
        raf.writeInt(mainTokenCount);
      }
-    RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
+    RAFList.write(raf, sortedIndexEntries, indexEntrySerializer);
      new SerializableSerializer<Set<String>>().write(raf, stoplist);
      UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5 /* bytes per entry */);
    }
@@ -154,38 +155,49 @@ public final class Index implements RAFSerializable<Index> {
      }
    }
    
-  public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
-    public final String token;
-    private final String normalizedToken;
-    public final int startRow;
-    public final int numRows;  // doesn't count the token row!
-    
-    
-    static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
+  private final RAFSerializer<IndexEntry> indexEntrySerializer = new RAFSerializer<IndexEntry> () {
        @Override
        public IndexEntry read(RandomAccessFile raf) throws IOException {
-        return new IndexEntry(raf);
+        return new IndexEntry(Index.this, raf);
        }
        @Override
        public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
          t.write(raf);
        }};
        
-    public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
+
+  public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
+    private final Index index;
+    public final String token;
+    private final String normalizedToken;
+    public final int startRow;
+    public final int numRows;  // doesn't count the token row!
+    public final List<HtmlEntry> htmlEntries;
+    
+    
+    public IndexEntry(final Index index, final String token, final String normalizedToken, final int startRow, final int numRows) {
+      this.index = index;
        assert token.equals(token.trim());
        assert token.length() > 0;
        this.token = token;
        this.normalizedToken = normalizedToken;
        this.startRow = startRow;
        this.numRows = numRows;
+      this.htmlEntries = new ArrayList<HtmlEntry>();
      }
      
-    public IndexEntry(final RandomAccessFile raf) throws IOException {
+    public IndexEntry(final Index index, final RandomAccessFile raf) throws IOException {
+      this.index = index;
        token = raf.readUTF();
        startRow = raf.readInt();
        numRows = raf.readInt();
        final boolean hasNormalizedForm = raf.readBoolean();
        normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
+      if (index.dict.dictFileVersion >= 6) {
+        this.htmlEntries = CachingList.create(RAFList.create(raf, index.dict.htmlEntryIndexSerializer, raf.getFilePointer()), 1);
+      } else {
+        this.htmlEntries = Collections.emptyList();
+      }
      }
      
      public void write(RandomAccessFile raf) throws IOException {
@@ -197,6 +209,7 @@ public final class Index implements RAFSerializable<Index> {
        if (hasNormalizedForm) {
          raf.writeUTF(normalizedToken);
        }
+      RAFList.write(raf, htmlEntries, index.dict.htmlEntryIndexSerializer);
      }
  
      public String toString() {
@@ -247,10 +260,10 @@ public final class Index implements RAFSerializable<Index> {
          final int result = windBackCase(token, mid, interrupted);
          return result;
        } else if (comp < 0) {
-        //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
+        // System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
          end = mid;
        } else {
-        //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
+        // System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
          start = mid + 1;
        }
      }
@@ -303,7 +316,8 @@ public final class Index implements RAFSerializable<Index> {
    }
    
    
-  public final List<RowBase> multiWordSearch(final List<String> searchTokens, final AtomicBoolean interrupted) {
+  public final List<RowBase> multiWordSearch(
+          final String searchText, final List<String> searchTokens, final AtomicBoolean interrupted) {
      final long startMills = System.currentTimeMillis();
      final List<RowBase> result = new ArrayList<RowBase>();
      
@@ -311,7 +325,7 @@ public final class Index implements RAFSerializable<Index> {
      
      String bestPrefix = null;
      int leastRows = Integer.MAX_VALUE;
-    final StringBuilder regex = new StringBuilder();
+    final StringBuilder searchTokensRegex = new StringBuilder();
      for (int i = 0; i < searchTokens.size(); ++i) {
        if (interrupted.get()) { return null; }
        final String searchToken = searchTokens.get(i);
@@ -333,12 +347,12 @@ public final class Index implements RAFSerializable<Index> {
          }
        }
  
-      if (regex.length() > 0) {
-        regex.append("[\\s]*");
+      if (searchTokensRegex.length() > 0) {
+        searchTokensRegex.append("[\\s]*");
        }
-      regex.append(Pattern.quote(normalized));
+      searchTokensRegex.append(Pattern.quote(normalized));
      }
-    final Pattern pattern = Pattern.compile(regex.toString());
+    final Pattern pattern = Pattern.compile(searchTokensRegex.toString());
      
      if (bestPrefix == null) {
        bestPrefix = searchTokens.get(0);
@@ -355,14 +369,22 @@ public final class Index implements RAFSerializable<Index> {
      }
      
      int matchCount = 0;
-    final Set<RowKey> cachedRowKeys = new HashSet<RowBase.RowKey>();
      
-//    for (final String searchToken : searchTokens) {
-    final String searchToken = bestPrefix;
+    final int exactMatchIndex = findInsertionPointIndex(searchText, interrupted);
+    if (exactMatchIndex != -1) {
+        final IndexEntry exactMatch = sortedIndexEntries.get(exactMatchIndex);
+        if (pattern.matcher(exactMatch.token).matches()) {
+            matches.get(RowMatchType.TITLE_MATCH).add(rows.get(exactMatch.startRow));
+        }
+    }
+
      
+    final String searchToken = bestPrefix;
      final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
-
-    for (int index = insertionPointIndex; index < sortedIndexEntries.size() && matchCount < MAX_SEARCH_ROWS; ++index) {
+    final Set<RowKey> rowsAlreadySeen = new HashSet<RowBase.RowKey>();
+    for (int index = insertionPointIndex; 
+            index < sortedIndexEntries.size() && matchCount < MAX_SEARCH_ROWS; 
+            ++index) {
          if (interrupted.get()) { return null; }
          final IndexEntry indexEntry = sortedIndexEntries.get(index);
          if (!indexEntry.normalizedToken.startsWith(searchToken)) {
@@ -372,14 +394,16 @@ public final class Index implements RAFSerializable<Index> {
  //        System.out.println("Searching indexEntry: " + indexEntry.token);
  
          // Extra +1 to skip token row.
-        for (int rowIndex = indexEntry.startRow + 1; rowIndex < indexEntry.startRow + 1 + indexEntry.numRows && rowIndex < rows.size(); ++rowIndex) {
+        for (int rowIndex = indexEntry.startRow + 1; 
+                rowIndex < indexEntry.startRow + 1 + indexEntry.numRows && rowIndex < rows.size(); 
+                ++rowIndex) {
            if (interrupted.get()) { return null; }
            final RowBase row = rows.get(rowIndex);
            final RowBase.RowKey rowKey = row.getRowKey();
-          if (cachedRowKeys.contains(rowKey)) {
+          if (rowsAlreadySeen.contains(rowKey)) {
              continue;
            }
-          cachedRowKeys.add(rowKey);
+          rowsAlreadySeen.add(rowKey);
            final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(), swapPairEntries);
            if (matchType != RowMatchType.NO_MATCH) {
              matches.get(matchType).add(row);