src/com/hughes/android/dictionary/engine/Index.java

   1 // Copyright 2011 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 /**
  16  *
  17  */
  18 package com.hughes.android.dictionary.engine;
  19
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;

import com.hughes.android.dictionary.DictionaryInfo;
import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
import com.hughes.util.CachingList;
import com.hughes.util.raf.RAFList;
import com.hughes.util.raf.RAFSerializable;
import com.hughes.util.raf.RAFSerializer;
import com.hughes.util.raf.SerializableSerializer;
import com.hughes.util.raf.UniformRAFList;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Transliterator;
  44
  45 public final class Index implements RAFSerializable<Index> {
  46
  // Cache size passed to CachingList.create() for both the index-entry list and
  // the row list when an Index is loaded from a file.
  static final int CACHE_SIZE = 5000;

  // The dictionary that owns this index; its dictFileVersion selects which
  // optional fields are read from / written to the file.
  final Dictionary dict;

  public final String shortName;  // Typically the ISO code for the language.
  public final String longName;

  // persisted: tells how the entries are sorted.
  public final Language sortLanguage;
  // persisted: rule text handed to Transliterator.createFromRules() by normalizer().
  final String normalizerRules;

  // Not persisted: created lazily from normalizerRules on the first call to normalizer().
  private Transliterator normalizer;

  // persisted
  // Searched with a binary search by findInsertionPointIndex(), so it is
  // expected to be ordered according to sortLanguage's collator.
  public final List<IndexEntry> sortedIndexEntries;

  // persisted.
  // Tokens found in this set are not used by multiWordSearch() to pick the
  // index entry a search starts from.
  public final Set<String> stoplist;

  // One big list!
  // Various sub-types.
  // persisted
  public final List<RowBase> rows;
  // persisted: forwarded to RowBase.matches() and RowBase.LengthComparator.
  public final boolean swapPairEntries;

  // Version 2:
  // Only read/written when dict.dictFileVersion >= 2; stays -1 otherwise.
  int mainTokenCount = -1;
  75
  76   // --------------------------------------------------------------------------
  77
  /**
   * Creates a fresh in-memory index with no entries and no rows.
   *
   * @param dict the dictionary this index belongs to
   * @param shortName short name of the index (typically the ISO language code)
   * @param longName long name of the index
   * @param sortLanguage language whose collator defines the entry order
   * @param normalizerRules rule text used to build the normalizer on demand
   * @param swapPairEntries stored as-is and forwarded to row matching/sorting
   * @param stoplist tokens that should not drive a multi-word search; stored by reference
   */
  public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries, final Set<String> stoplist) {
    // Identity and configuration supplied by the caller.
    this.dict = dict;
    this.shortName = shortName;
    this.longName = longName;
    this.sortLanguage = sortLanguage;
    this.normalizerRules = normalizerRules;
    this.swapPairEntries = swapPairEntries;
    this.stoplist = stoplist;

    // Content containers start out empty and mutable.
    this.sortedIndexEntries = new ArrayList<IndexEntry>();
    this.rows = new ArrayList<RowBase>();

    // The transliterator is only built when normalizer() is first called.
    this.normalizer = null;
  }
  91
  92   public synchronized Transliterator normalizer() {
  93     if (normalizer == null) {
  94       normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
  95     }
  96     return normalizer;
  97   }
  98
  /**
   * Loads an index from {@code raf}, starting at the file's current position.
   *
   * Fields are consumed in this order: short name, long name, sort-language
   * code, normalizer rules, swapPairEntries, main token count (file version
   * 2 and later), the index-entry list, the stop list (file version 4 and
   * later) and finally the row list.
   *
   * @param dict the owning dictionary; its {@code dictFileVersion} decides which
   *     optional fields are present
   * @param raf file to read from
   * @throws IOException if reading fails or the stored language code is not
   *     known to {@code Language.lookup}
   */
  public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
    this.dict = dict;
    shortName = raf.readUTF();
    longName = raf.readUTF();
    final String languageCode = raf.readUTF();
    sortLanguage = Language.lookup(languageCode);
    normalizerRules = raf.readUTF();
    swapPairEntries = raf.readBoolean();
    // Reported only after the fixed-size header fields above have been consumed.
    if (sortLanguage == null) {
      throw new IOException("Unsupported language: " + languageCode);
    }
    if (dict.dictFileVersion >= 2) {
      mainTokenCount = raf.readInt();
    }
    // The entry list is backed by the file and wrapped in a cache of CACHE_SIZE.
    // NOTE(review): the reads below rely on RAFList.create leaving the file
    // pointer just past the list - confirm against RAFList.
    sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
    if (dict.dictFileVersion >= 4) {
      stoplist = new SerializableSerializer<Set<String>>().read(raf);
    } else {
      // Older files carry no stop list.
      stoplist = Collections.emptySet();
    }
    // Rows are deserialized through a RowBase.Serializer bound to this index.
    rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
  }
 121
 122   @Override
 123   public void write(final RandomAccessFile raf) throws IOException {
 124     raf.writeUTF(shortName);
 125     raf.writeUTF(longName);
 126     raf.writeUTF(sortLanguage.getIsoCode());
 127     raf.writeUTF(normalizerRules);
 128     raf.writeBoolean(swapPairEntries);
 129     if (dict.dictFileVersion >= 2) {
 130       raf.writeInt(mainTokenCount);
 131     }
 132     RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
 133     new SerializableSerializer<Set<String>>().write(raf, stoplist);
 134     UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
 135   }
 136
 137   public void print(final PrintStream out) {
 138     for (final RowBase row : rows) {
 139       row.print(out);
 140     }
 141   }
 142
 143   public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
 144     public final String token;
 145     private final String normalizedToken;
 146     public final int startRow;
 147     public final int numRows;
 148
 149
 150     static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
 151       @Override
 152       public IndexEntry read(RandomAccessFile raf) throws IOException {
 153         return new IndexEntry(raf);
 154       }
 155       @Override
 156       public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
 157         t.write(raf);
 158       }};
 159
 160     public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
 161       assert token.equals(token.trim());
 162       assert token.length() > 0;
 163       this.token = token;
 164       this.normalizedToken = normalizedToken;
 165       this.startRow = startRow;
 166       this.numRows = numRows;
 167     }
 168
 169     public IndexEntry(final RandomAccessFile raf) throws IOException {
 170       token = raf.readUTF();
 171       startRow = raf.readInt();
 172       numRows = raf.readInt();
 173       final boolean hasNormalizedForm = raf.readBoolean();
 174       normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
 175     }
 176
 177     public void write(RandomAccessFile raf) throws IOException {
 178       raf.writeUTF(token);
 179       raf.writeInt(startRow);
 180       raf.writeInt(numRows);
 181       final boolean hasNormalizedForm = !token.equals(normalizedToken);
 182       raf.writeBoolean(hasNormalizedForm);
 183       if (hasNormalizedForm) {
 184         raf.writeUTF(normalizedToken);
 185       }
 186     }
 187
 188     public String toString() {
 189       return String.format("%s@%d(%d)", token, startRow, numRows);
 190     }
 191
 192     public String normalizedToken() {
 193       return normalizedToken;
 194     }
 195   }
 196
 197   public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
 198     final int index = findInsertionPointIndex(token, interrupted);
 199     return index != -1 ? sortedIndexEntries.get(index) : null;
 200   }
 201
 202   public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
 203     token = normalizeToken(token);
 204
 205     int start = 0;
 206     int end = sortedIndexEntries.size();
 207
 208     final Collator sortCollator = sortLanguage.getCollator();
 209     while (start < end) {
 210       final int mid = (start + end) / 2;
 211       if (interrupted.get()) {
 212         return -1;
 213       }
 214       final IndexEntry midEntry = sortedIndexEntries.get(mid);
 215
 216       final int comp = sortCollator.compare(token, midEntry.normalizedToken());
 217       if (comp == 0) {
 218         final int result = windBackCase(token, mid, interrupted);
 219         return result;
 220       } else if (comp < 0) {
 221         //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
 222         end = mid;
 223       } else {
 224         //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
 225         start = mid + 1;
 226       }
 227     }
 228
 229     // If we search for a substring of a string that's in there, return that.
 230     int result = Math.min(start, sortedIndexEntries.size() - 1);
 231     result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
 232     return result;
 233   }
 234
 235   private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
 236     while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
 237       --result;
 238       if (interrupted.get()) {
 239         return result;
 240       }
 241     }
 242     return result;
 243   }
 244
 245   public IndexInfo getIndexInfo() {
 246     return new DictionaryInfo.IndexInfo(shortName, sortedIndexEntries.size(), mainTokenCount);
 247   }
 248
 249   public final List<RowBase> multiWordSearch(final List<String> searchTokens, final AtomicBoolean interrupted) {
 250     final List<RowBase> result = new ArrayList<RowBase>();
 251
 252     final Set<String> normalizedNonStoplist = new LinkedHashSet<String>();
 253
 254     final StringBuilder regex = new StringBuilder();
 255     for (int i = 0; i < searchTokens.size(); ++i) {
 256       final String searchToken = searchTokens.get(i);
 257       final String normalized = normalizeToken(searchTokens.get(i));
 258       // Normalize them all.
 259       searchTokens.set(i, normalized);
 260
 261       if (!stoplist.contains(searchToken)) {
 262         normalizedNonStoplist.add(normalized);
 263       }
 264
 265       if (regex.length() > 0) {
 266         regex.append("[\\s]*");
 267       }
 268       regex.append(Pattern.quote(normalized));
 269     }
 270     final Pattern pattern = Pattern.compile(regex.toString());
 271
 272
 273     // The things that match.
 274     final Map<RowMatchType,List<RowBase>> matches = new EnumMap<RowMatchType, List<RowBase>>(RowMatchType.class);
 275     for (final RowMatchType rowMatchType : RowMatchType.values()) {
 276       if (rowMatchType != RowMatchType.NO_MATCH) {
 277         matches.put(rowMatchType, new ArrayList<RowBase>());
 278       }
 279     }
 280
 281     int bestRowCount = Integer.MAX_VALUE;
 282     String bestToken = null;
 283     for (final String searchToken : normalizedNonStoplist) {
 284       final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
 285       if (interrupted.get()) { return null; }
 286       if (insertionPointIndex == -1) {
 287         // If we've typed "train statio", don't fail just because the index
 288         // doesn't contain "statio".
 289         continue;
 290       }
 291
 292       int rowCount = 0;
 293       for (int index = insertionPointIndex; index < sortedIndexEntries.size(); ++index) {
 294         if (interrupted.get()) { return null; }
 295         final IndexEntry indexEntry = sortedIndexEntries.get(index);
 296         if (!indexEntry.normalizedToken.equals(searchToken)) {
 297           break;
 298         }
 299         rowCount += indexEntry.numRows;
 300       }
 301
 302       //System.out.println(searchToken + ", rowCount=" + rowCount);
 303       if (rowCount < bestRowCount) {
 304         bestRowCount = rowCount;
 305         bestToken = searchToken;
 306       }
 307     }
 308
 309     final String searchToken = bestToken != null ? bestToken : searchTokens.get(0);
 310
 311 //    for (final String searchToken : searchTokens) {
 312
 313     final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
 314     if (interrupted.get()) { return null; }
 315
 316
 317 //      System.out.println("Searching token: " + searchToken);
 318
 319
 320       for (int index = insertionPointIndex; index < sortedIndexEntries.size(); ++index) {
 321         if (interrupted.get()) { return null; }
 322         final IndexEntry indexEntry = sortedIndexEntries.get(index);
 323         if (!indexEntry.normalizedToken.equals(searchToken)) {
 324           break;
 325         }
 326
 327 //        System.out.println("Searching indexEntry: " + indexEntry.token);
 328
 329         for (int rowIndex = indexEntry.startRow; rowIndex < indexEntry.startRow + indexEntry.numRows; ++rowIndex) {
 330           if (interrupted.get()) { return null; }
 331           final RowBase row = rows.get(rowIndex);
 332           final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(), swapPairEntries);
 333           if (matchType != RowMatchType.NO_MATCH) {
 334             matches.get(matchType).add(row);
 335           }
 336         }
 337       }
 338 //    }  // searchTokens
 339
 340     final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator(swapPairEntries);
 341     for (final Collection<RowBase> rows : matches.values()) {
 342       final List<RowBase> ordered = new ArrayList<RowBase>(rows);
 343       Collections.sort(ordered, lengthComparator);
 344       result.addAll(ordered);
 345     }
 346
 347     return result;
 348   }
 349
 350   private String normalizeToken(final String searchToken) {
 351     if (TransliteratorManager.init(null)) {
 352       final Transliterator normalizer = normalizer();
 353       return normalizer.transliterate(searchToken);
 354     } else {
 355       // Do our best since the Transliterators aren't up yet.
 356       return searchToken.toLowerCase();
 357     }
 358   }
 359
 360 }