1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
18 package com.hughes.android.dictionary.engine;
20 import java.io.IOException;
21 import java.io.PrintStream;
22 import java.io.RandomAccessFile;
23 import java.util.ArrayList;
24 import java.util.Collection;
25 import java.util.List;
26 import java.util.concurrent.atomic.AtomicBoolean;
28 import com.hughes.util.CachingList;
29 import com.hughes.util.raf.RAFList;
30 import com.hughes.util.raf.RAFSerializable;
31 import com.hughes.util.raf.RAFSerializer;
32 import com.hughes.util.raf.UniformRAFList;
33 import com.ibm.icu.text.Collator;
34 import com.ibm.icu.text.Transliterator;
36 public final class Index implements RAFSerializable<Index> {
38 static final int CACHE_SIZE = 5000;
40 final Dictionary dict;
42 public final String shortName;
43 public final String longName;
45 // persisted: tells how the entries are sorted.
46 public final Language sortLanguage;
47 final String normalizerRules;
49 // Built from the two above.
50 private Transliterator normalizer;
53 public final List<IndexEntry> sortedIndexEntries;
58 public final List<RowBase> rows;
60 public final boolean swapPairEntries;
62 // --------------------------------------------------------------------------
64 public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) {
66 this.shortName = shortName;
67 this.longName = longName;
68 this.sortLanguage = sortLanguage;
69 this.normalizerRules = normalizerRules;
70 this.swapPairEntries = swapPairEntries;
71 sortedIndexEntries = new ArrayList<IndexEntry>();
72 rows = new ArrayList<RowBase>();
77 public synchronized Transliterator normalizer() {
78 if (normalizer == null) {
79 normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
84 public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
86 shortName = raf.readUTF();
87 longName = raf.readUTF();
88 final String languageCode = raf.readUTF();
89 sortLanguage = Language.lookup(languageCode);
90 normalizerRules = raf.readUTF();
91 swapPairEntries = raf.readBoolean();
92 if (sortLanguage == null) {
93 throw new IOException("Unsupported language: " + languageCode);
95 sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
96 rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
100 public void write(final RandomAccessFile raf) throws IOException {
101 raf.writeUTF(shortName);
102 raf.writeUTF(longName);
103 raf.writeUTF(sortLanguage.getSymbol());
104 raf.writeUTF(normalizerRules);
105 raf.writeBoolean(swapPairEntries);
106 RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
107 UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
110 public void print(final PrintStream out) {
111 for (final RowBase row : rows) {
116 public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
117 public final String token;
118 private final String normalizedToken;
119 public final int startRow;
120 public final int numRows;
123 static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
125 public IndexEntry read(RandomAccessFile raf) throws IOException {
126 return new IndexEntry(raf);
129 public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
133 public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
134 assert token.equals(token.trim());
135 assert token.length() > 0;
137 this.normalizedToken = normalizedToken;
138 this.startRow = startRow;
139 this.numRows = numRows;
142 public IndexEntry(final RandomAccessFile raf) throws IOException {
143 token = raf.readUTF();
144 startRow = raf.readInt();
145 numRows = raf.readInt();
146 final boolean hasNormalizedForm = raf.readBoolean();
147 normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
150 public void write(RandomAccessFile raf) throws IOException {
152 raf.writeInt(startRow);
153 raf.writeInt(numRows);
154 final boolean hasNormalizedForm = !token.equals(normalizedToken);
155 raf.writeBoolean(hasNormalizedForm);
156 if (hasNormalizedForm) {
157 raf.writeUTF(normalizedToken);
161 public String toString() {
162 return String.format("%s@%d(%d)", token, startRow, numRows);
165 public String normalizedToken() {
166 return normalizedToken;
170 public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
171 final Transliterator normalizer = normalizer();
172 if (TransliteratorManager.init(null)) {
173 token = normalizer.transliterate(token);
175 // Do our best since the Transliterators aren't up yet.
176 token = token.toLowerCase();
180 int end = sortedIndexEntries.size();
182 final Collator sortCollator = sortLanguage.getCollator();
183 while (start < end) {
184 final int mid = (start + end) / 2;
185 if (interrupted.get()) {
188 final IndexEntry midEntry = sortedIndexEntries.get(mid);
190 final int comp = sortCollator.compare(token, midEntry.normalizedToken());
192 final int result = windBackCase(token, mid, interrupted);
193 return sortedIndexEntries.get(result);
194 } else if (comp < 0) {
195 //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
198 //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
203 // If we search for a substring of a string that's in there, return that.
204 int result = Math.min(start, sortedIndexEntries.size() - 1);
205 result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
206 return sortedIndexEntries.get(result);
209 public static final class SearchResult {
210 public final IndexEntry insertionPoint;
211 public final IndexEntry longestPrefix;
212 public final String longestPrefixString;
213 public final boolean success;
215 public SearchResult(IndexEntry insertionPoint, IndexEntry longestPrefix,
216 String longestPrefixString, boolean success) {
217 this.insertionPoint = insertionPoint;
218 this.longestPrefix = longestPrefix;
219 this.longestPrefixString = longestPrefixString;
220 this.success = success;
224 public String toString() {
225 return String.format("inerstionPoint=%s,longestPrefix=%s,longestPrefixString=%s,success=%b", insertionPoint.toString(), longestPrefix.toString(), longestPrefixString, success);
229 // public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
230 // token = normalizer.transliterate(token);
231 // if (token.length() == 0) {
232 // return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
234 // IndexEntry insertionPoint = null;
235 // IndexEntry result = null;
236 // boolean unmodified = true;
237 // while (!interrupted.get() && token.length() > 0) {
238 // result = findInsertionPoint(token, interrupted);
239 // if (result == null) {
243 // insertionPoint = result;
245 // if (result.normalizedToken(normalizer).startsWith(token)) {
246 // return new SearchResult(insertionPoint, result, token, unmodified);
248 // unmodified = false;
249 // token = token.substring(0, token.length() - 1);
251 // return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
254 private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
255 while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
257 if (interrupted.get()) {
265 public int tokenRowBinarySearch(final int rowIndex) {
267 int end = sortedIndexEntries.size();
268 while (start < end) {
269 final int mid = (start + end) / 2;
270 final int midRowIndex = sortedIndexEntries.get(mid).startRow;
271 if (midRowIndex == rowIndex) {