1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
18 package com.hughes.android.dictionary.engine;
20 import java.io.IOException;
21 import java.io.PrintStream;
22 import java.io.RandomAccessFile;
23 import java.util.ArrayList;
24 import java.util.Collection;
25 import java.util.Collections;
26 import java.util.EnumMap;
27 import java.util.LinkedHashSet;
28 import java.util.List;
31 import java.util.concurrent.atomic.AtomicBoolean;
32 import java.util.regex.Pattern;
34 import com.hughes.android.dictionary.DictionaryInfo;
35 import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
36 import com.hughes.util.CachingList;
37 import com.hughes.util.raf.RAFList;
38 import com.hughes.util.raf.RAFSerializable;
39 import com.hughes.util.raf.RAFSerializer;
40 import com.hughes.util.raf.SerializableSerializer;
41 import com.hughes.util.raf.UniformRAFList;
42 import com.ibm.icu.text.Collator;
43 import com.ibm.icu.text.Transliterator;
45 public final class Index implements RAFSerializable<Index> {
47 static final int CACHE_SIZE = 5000;
49 final Dictionary dict;
51 public final String shortName; // Typically the ISO code for the language.
52 public final String longName;
54 // persisted: tells how the entries are sorted.
55 public final Language sortLanguage;
56 final String normalizerRules;
58 // Built from the two above.
59 private Transliterator normalizer;
62 public final List<IndexEntry> sortedIndexEntries;
65 public final Set<String> stoplist;
70 public final List<RowBase> rows;
71 public final boolean swapPairEntries;
74 int mainTokenCount = -1;
76 // --------------------------------------------------------------------------
78 public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries, final Set<String> stoplist) {
80 this.shortName = shortName;
81 this.longName = longName;
82 this.sortLanguage = sortLanguage;
83 this.normalizerRules = normalizerRules;
84 this.swapPairEntries = swapPairEntries;
85 sortedIndexEntries = new ArrayList<IndexEntry>();
86 this.stoplist = stoplist;
87 rows = new ArrayList<RowBase>();
92 public synchronized Transliterator normalizer() {
93 if (normalizer == null) {
94 normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
99 public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
101 shortName = raf.readUTF();
102 longName = raf.readUTF();
103 final String languageCode = raf.readUTF();
104 sortLanguage = Language.lookup(languageCode);
105 normalizerRules = raf.readUTF();
106 swapPairEntries = raf.readBoolean();
107 if (sortLanguage == null) {
108 throw new IOException("Unsupported language: " + languageCode);
110 if (dict.dictFileVersion >= 2) {
111 mainTokenCount = raf.readInt();
113 sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
114 if (dict.dictFileVersion >= 4) {
115 stoplist = new SerializableSerializer<Set<String>>().read(raf);
117 stoplist = Collections.emptySet();
119 rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
123 public void write(final RandomAccessFile raf) throws IOException {
124 raf.writeUTF(shortName);
125 raf.writeUTF(longName);
126 raf.writeUTF(sortLanguage.getIsoCode());
127 raf.writeUTF(normalizerRules);
128 raf.writeBoolean(swapPairEntries);
129 if (dict.dictFileVersion >= 2) {
130 raf.writeInt(mainTokenCount);
132 RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
133 new SerializableSerializer<Set<String>>().write(raf, stoplist);
134 UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
137 public void print(final PrintStream out) {
138 for (final RowBase row : rows) {
143 public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
144 public final String token;
145 private final String normalizedToken;
146 public final int startRow;
147 public final int numRows;
150 static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
152 public IndexEntry read(RandomAccessFile raf) throws IOException {
153 return new IndexEntry(raf);
156 public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
160 public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
161 assert token.equals(token.trim());
162 assert token.length() > 0;
164 this.normalizedToken = normalizedToken;
165 this.startRow = startRow;
166 this.numRows = numRows;
169 public IndexEntry(final RandomAccessFile raf) throws IOException {
170 token = raf.readUTF();
171 startRow = raf.readInt();
172 numRows = raf.readInt();
173 final boolean hasNormalizedForm = raf.readBoolean();
174 normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
177 public void write(RandomAccessFile raf) throws IOException {
179 raf.writeInt(startRow);
180 raf.writeInt(numRows);
181 final boolean hasNormalizedForm = !token.equals(normalizedToken);
182 raf.writeBoolean(hasNormalizedForm);
183 if (hasNormalizedForm) {
184 raf.writeUTF(normalizedToken);
188 public String toString() {
189 return String.format("%s@%d(%d)", token, startRow, numRows);
192 public String normalizedToken() {
193 return normalizedToken;
197 public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
198 final int index = findInsertionPointIndex(token, interrupted);
199 return index != -1 ? sortedIndexEntries.get(index) : null;
202 public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
203 token = normalizeToken(token);
206 int end = sortedIndexEntries.size();
208 final Collator sortCollator = sortLanguage.getCollator();
209 while (start < end) {
210 final int mid = (start + end) / 2;
211 if (interrupted.get()) {
214 final IndexEntry midEntry = sortedIndexEntries.get(mid);
216 final int comp = sortCollator.compare(token, midEntry.normalizedToken());
218 final int result = windBackCase(token, mid, interrupted);
220 } else if (comp < 0) {
221 //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
224 //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
229 // If we search for a substring of a string that's in there, return that.
230 int result = Math.min(start, sortedIndexEntries.size() - 1);
231 result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
235 private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
236 while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
238 if (interrupted.get()) {
245 public IndexInfo getIndexInfo() {
246 return new DictionaryInfo.IndexInfo(shortName, sortedIndexEntries.size(), mainTokenCount);
249 public final List<RowBase> multiWordSearch(final List<String> searchTokens, final AtomicBoolean interrupted) {
250 final List<RowBase> result = new ArrayList<RowBase>();
252 final Set<String> normalizedNonStoplist = new LinkedHashSet<String>();
254 final StringBuilder regex = new StringBuilder();
255 for (int i = 0; i < searchTokens.size(); ++i) {
256 final String searchToken = searchTokens.get(i);
257 final String normalized = normalizeToken(searchTokens.get(i));
258 // Normalize them all.
259 searchTokens.set(i, normalized);
261 if (!stoplist.contains(searchToken)) {
262 normalizedNonStoplist.add(normalized);
265 if (regex.length() > 0) {
266 regex.append("[\\s]*");
268 regex.append(Pattern.quote(normalized));
270 final Pattern pattern = Pattern.compile(regex.toString());
273 // The things that match.
274 final Map<RowMatchType,List<RowBase>> matches = new EnumMap<RowMatchType, List<RowBase>>(RowMatchType.class);
275 for (final RowMatchType rowMatchType : RowMatchType.values()) {
276 if (rowMatchType != RowMatchType.NO_MATCH) {
277 matches.put(rowMatchType, new ArrayList<RowBase>());
281 int bestRowCount = Integer.MAX_VALUE;
282 String bestToken = null;
283 for (final String searchToken : normalizedNonStoplist) {
284 final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
285 if (interrupted.get()) { return null; }
286 if (insertionPointIndex == -1) {
287 // If we've typed "train statio", don't fail just because the index
288 // doesn't contain "statio".
293 for (int index = insertionPointIndex; index < sortedIndexEntries.size(); ++index) {
294 if (interrupted.get()) { return null; }
295 final IndexEntry indexEntry = sortedIndexEntries.get(index);
296 if (!indexEntry.normalizedToken.equals(searchToken)) {
299 rowCount += indexEntry.numRows;
302 //System.out.println(searchToken + ", rowCount=" + rowCount);
303 if (rowCount < bestRowCount) {
304 bestRowCount = rowCount;
305 bestToken = searchToken;
309 final String searchToken = bestToken != null ? bestToken : searchTokens.get(0);
311 // for (final String searchToken : searchTokens) {
313 final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
314 if (interrupted.get()) { return null; }
317 // System.out.println("Searching token: " + searchToken);
320 for (int index = insertionPointIndex; index < sortedIndexEntries.size(); ++index) {
321 if (interrupted.get()) { return null; }
322 final IndexEntry indexEntry = sortedIndexEntries.get(index);
323 if (!indexEntry.normalizedToken.equals(searchToken)) {
327 // System.out.println("Searching indexEntry: " + indexEntry.token);
329 for (int rowIndex = indexEntry.startRow; rowIndex < indexEntry.startRow + indexEntry.numRows; ++rowIndex) {
330 if (interrupted.get()) { return null; }
331 final RowBase row = rows.get(rowIndex);
332 final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(), swapPairEntries);
333 if (matchType != RowMatchType.NO_MATCH) {
334 matches.get(matchType).add(row);
340 final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator(swapPairEntries);
341 for (final Collection<RowBase> rows : matches.values()) {
342 final List<RowBase> ordered = new ArrayList<RowBase>(rows);
343 Collections.sort(ordered, lengthComparator);
344 result.addAll(ordered);
350 private String normalizeToken(final String searchToken) {
351 if (TransliteratorManager.init(null)) {
352 final Transliterator normalizer = normalizer();
353 return normalizer.transliterate(searchToken);
355 // Do our best since the Transliterators aren't up yet.
356 return searchToken.toLowerCase();