4 package com.hughes.android.dictionary.engine;
6 import java.io.IOException;
7 import java.io.PrintStream;
8 import java.io.RandomAccessFile;
9 import java.util.ArrayList;
10 import java.util.Collection;
11 import java.util.List;
12 import java.util.concurrent.atomic.AtomicBoolean;
14 import com.hughes.util.CachingList;
15 import com.hughes.util.raf.RAFList;
16 import com.hughes.util.raf.RAFSerializable;
17 import com.hughes.util.raf.RAFSerializer;
18 import com.hughes.util.raf.UniformRAFList;
19 import com.ibm.icu.text.Collator;
20 import com.ibm.icu.text.Transliterator;
22 public final class Index implements RAFSerializable<Index> {
24 static final int CACHE_SIZE = 5000;
26 final Dictionary dict;
28 public final String shortName;
29 public final String longName;
31 // persisted: tells how the entries are sorted.
32 public final Language sortLanguage;
33 final String normalizerRules;
35 // Built from the two above.
36 private Transliterator normalizer;
39 public final List<IndexEntry> sortedIndexEntries;
44 public final List<RowBase> rows;
46 public final boolean swapPairEntries;
48 // --------------------------------------------------------------------------
50 public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) {
52 this.shortName = shortName;
53 this.longName = longName;
54 this.sortLanguage = sortLanguage;
55 this.normalizerRules = normalizerRules;
56 this.swapPairEntries = swapPairEntries;
57 sortedIndexEntries = new ArrayList<IndexEntry>();
58 rows = new ArrayList<RowBase>();
63 public synchronized Transliterator normalizer() {
64 if (normalizer == null) {
65 normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
70 public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
72 shortName = raf.readUTF();
73 longName = raf.readUTF();
74 final String languageCode = raf.readUTF();
75 sortLanguage = Language.lookup(languageCode);
76 normalizerRules = raf.readUTF();
77 swapPairEntries = raf.readBoolean();
78 if (sortLanguage == null) {
79 throw new IOException("Unsupported language: " + languageCode);
81 sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
82 rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
86 public void write(final RandomAccessFile raf) throws IOException {
87 raf.writeUTF(shortName);
88 raf.writeUTF(longName);
89 raf.writeUTF(sortLanguage.getSymbol());
90 raf.writeUTF(normalizerRules);
91 raf.writeBoolean(swapPairEntries);
92 RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
93 UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
96 public void print(final PrintStream out) {
97 for (final RowBase row : rows) {
102 public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
103 public final String token;
104 private final String normalizedToken;
105 public final int startRow;
106 public final int numRows;
109 static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
111 public IndexEntry read(RandomAccessFile raf) throws IOException {
112 return new IndexEntry(raf);
115 public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
119 public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
120 assert token.equals(token.trim());
121 assert token.length() > 0;
123 this.normalizedToken = normalizedToken;
124 this.startRow = startRow;
125 this.numRows = numRows;
128 public IndexEntry(final RandomAccessFile raf) throws IOException {
129 token = raf.readUTF();
130 startRow = raf.readInt();
131 numRows = raf.readInt();
132 final boolean hasNormalizedForm = raf.readBoolean();
133 normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
136 public void write(RandomAccessFile raf) throws IOException {
138 raf.writeInt(startRow);
139 raf.writeInt(numRows);
140 final boolean hasNormalizedForm = !token.equals(normalizedToken);
141 raf.writeBoolean(hasNormalizedForm);
142 if (hasNormalizedForm) {
143 raf.writeUTF(normalizedToken);
147 public String toString() {
148 return String.format("%s@%d(%d)", token, startRow, numRows);
151 public String normalizedToken() {
152 return normalizedToken;
156 public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
157 final Transliterator normalizer = normalizer();
158 if (TransliteratorManager.init(null)) {
159 token = normalizer.transliterate(token);
163 int end = sortedIndexEntries.size();
165 final Collator sortCollator = sortLanguage.getCollator();
166 while (start < end) {
167 final int mid = (start + end) / 2;
168 if (interrupted.get()) {
171 final IndexEntry midEntry = sortedIndexEntries.get(mid);
173 final int comp = sortCollator.compare(token, midEntry.normalizedToken());
175 final int result = windBackCase(token, mid, interrupted);
176 return sortedIndexEntries.get(result);
177 } else if (comp < 0) {
178 //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
181 //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
186 // If we search for a substring of a string that's in there, return that.
187 int result = Math.min(start, sortedIndexEntries.size() - 1);
188 result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
189 return sortedIndexEntries.get(result);
192 public static final class SearchResult {
193 public final IndexEntry insertionPoint;
194 public final IndexEntry longestPrefix;
195 public final String longestPrefixString;
196 public final boolean success;
198 public SearchResult(IndexEntry insertionPoint, IndexEntry longestPrefix,
199 String longestPrefixString, boolean success) {
200 this.insertionPoint = insertionPoint;
201 this.longestPrefix = longestPrefix;
202 this.longestPrefixString = longestPrefixString;
203 this.success = success;
207 public String toString() {
208 return String.format("inerstionPoint=%s,longestPrefix=%s,longestPrefixString=%s,success=%b", insertionPoint.toString(), longestPrefix.toString(), longestPrefixString, success);
212 // public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
213 // token = normalizer.transliterate(token);
214 // if (token.length() == 0) {
215 // return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
217 // IndexEntry insertionPoint = null;
218 // IndexEntry result = null;
219 // boolean unmodified = true;
220 // while (!interrupted.get() && token.length() > 0) {
221 // result = findInsertionPoint(token, interrupted);
222 // if (result == null) {
226 // insertionPoint = result;
228 // if (result.normalizedToken(normalizer).startsWith(token)) {
229 // return new SearchResult(insertionPoint, result, token, unmodified);
231 // unmodified = false;
232 // token = token.substring(0, token.length() - 1);
234 // return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
237 private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
238 while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
240 if (interrupted.get()) {
248 public int tokenRowBinarySearch(final int rowIndex) {
250 int end = sortedIndexEntries.size();
251 while (start < end) {
252 final int mid = (start + end) / 2;
253 final int midRowIndex = sortedIndexEntries.get(mid).startRow;
254 if (midRowIndex == rowIndex) {