4 package com.hughes.android.dictionary.engine;
6 import java.io.IOException;
7 import java.io.PrintStream;
8 import java.io.RandomAccessFile;
9 import java.util.ArrayList;
10 import java.util.Collection;
11 import java.util.List;
12 import java.util.concurrent.atomic.AtomicBoolean;
14 import com.hughes.util.CachingList;
15 import com.hughes.util.raf.RAFList;
16 import com.hughes.util.raf.RAFSerializable;
17 import com.hughes.util.raf.RAFSerializer;
18 import com.hughes.util.raf.UniformRAFList;
19 import com.ibm.icu.text.Collator;
20 import com.ibm.icu.text.Transliterator;
22 public final class Index implements RAFSerializable<Index> {
24 static final int CACHE_SIZE = 5000;
26 final Dictionary dict;
28 public final String shortName;
29 public final String longName;
31 // persisted: tells how the entries are sorted.
32 public final Language sortLanguage;
33 final String normalizerRules;
35 // Built from the two above.
36 final Transliterator normalizer;
39 public final List<IndexEntry> sortedIndexEntries;
44 public final List<RowBase> rows;
46 public final boolean swapPairEntries;
48 // --------------------------------------------------------------------------
50 public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) {
52 this.shortName = shortName;
53 this.longName = longName;
54 this.sortLanguage = sortLanguage;
55 this.normalizerRules = normalizerRules;
56 this.swapPairEntries = swapPairEntries;
57 sortedIndexEntries = new ArrayList<IndexEntry>();
58 rows = new ArrayList<RowBase>();
60 normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
63 public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
65 shortName = raf.readUTF();
66 longName = raf.readUTF();
67 final String languageCode = raf.readUTF();
68 sortLanguage = Language.lookup(languageCode);
69 normalizerRules = raf.readUTF();
70 swapPairEntries = raf.readBoolean();
71 if (sortLanguage == null) {
72 throw new IOException("Unsupported language: " + languageCode);
74 sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
75 rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
77 normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
81 public void write(final RandomAccessFile raf) throws IOException {
82 raf.writeUTF(shortName);
83 raf.writeUTF(longName);
84 raf.writeUTF(sortLanguage.getSymbol());
85 raf.writeUTF(normalizerRules);
86 raf.writeBoolean(swapPairEntries);
87 RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
88 UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
91 public void print(final PrintStream out) {
92 for (final RowBase row : rows) {
97 public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
98 public final String token;
99 public final int startRow;
100 public final int numRows;
102 private String normalizedToken;
104 static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
106 public IndexEntry read(RandomAccessFile raf) throws IOException {
107 return new IndexEntry(raf);
110 public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
114 public IndexEntry(final String token, final int startRow, final int numRows) {
115 assert token.equals(token.trim());
116 assert token.length() > 0;
118 this.startRow = startRow;
119 this.numRows = numRows;
122 public IndexEntry(final RandomAccessFile raf) throws IOException {
123 token = raf.readUTF();
124 startRow = raf.readInt();
125 numRows = raf.readInt();
128 public void write(RandomAccessFile raf) throws IOException {
130 raf.writeInt(startRow);
131 raf.writeInt(numRows);
134 public String toString() {
135 return String.format("%s@%d(%d)", token, startRow, numRows);
138 public synchronized String normalizedToken(final Transliterator normalizer) {
139 if (normalizedToken == null) {
140 normalizedToken = normalizer.transform(token);
142 return normalizedToken;
146 public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
147 token = normalizer.transliterate(token);
150 int end = sortedIndexEntries.size();
152 final Collator sortCollator = sortLanguage.getCollator();
153 while (start < end) {
154 final int mid = (start + end) / 2;
155 if (interrupted.get()) {
158 final IndexEntry midEntry = sortedIndexEntries.get(mid);
160 final int comp = sortCollator.compare(token, midEntry.normalizedToken(normalizer));
162 final int result = windBackCase(token, mid, interrupted);
163 return sortedIndexEntries.get(result);
164 } else if (comp < 0) {
165 System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken(normalizer) + ", mid=" + mid);
168 System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken(normalizer) + ", mid=" + mid);
173 // If we search for a substring of a string that's in there, return that.
174 int result = Math.min(start, sortedIndexEntries.size() - 1);
175 result = windBackCase(sortedIndexEntries.get(result).normalizedToken(normalizer), result, interrupted);
176 return sortedIndexEntries.get(result);
179 public static final class SearchResult {
180 public final IndexEntry insertionPoint;
181 public final IndexEntry longestPrefix;
182 public final String longestPrefixString;
183 public final boolean success;
185 public SearchResult(IndexEntry insertionPoint, IndexEntry longestPrefix,
186 String longestPrefixString, boolean success) {
187 this.insertionPoint = insertionPoint;
188 this.longestPrefix = longestPrefix;
189 this.longestPrefixString = longestPrefixString;
190 this.success = success;
194 public String toString() {
195 return String.format("inerstionPoint=%s,longestPrefix=%s,longestPrefixString=%s,success=%b", insertionPoint.toString(), longestPrefix.toString(), longestPrefixString, success);
199 // public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
200 // token = normalizer.transliterate(token);
201 // if (token.length() == 0) {
202 // return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
204 // IndexEntry insertionPoint = null;
205 // IndexEntry result = null;
206 // boolean unmodified = true;
207 // while (!interrupted.get() && token.length() > 0) {
208 // result = findInsertionPoint(token, interrupted);
209 // if (result == null) {
213 // insertionPoint = result;
215 // if (result.normalizedToken(normalizer).startsWith(token)) {
216 // return new SearchResult(insertionPoint, result, token, unmodified);
218 // unmodified = false;
219 // token = token.substring(0, token.length() - 1);
221 // return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
224 private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
225 while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken(normalizer).equals(token)) {
227 if (interrupted.get()) {
235 public int tokenRowBinarySearch(final int rowIndex) {
237 int end = sortedIndexEntries.size();
238 while (start < end) {
239 final int mid = (start + end) / 2;
240 final int midRowIndex = sortedIndexEntries.get(mid).startRow;
241 if (midRowIndex == rowIndex) {