4 package com.hughes.android.dictionary.engine;
6 import java.io.IOException;
7 import java.io.PrintStream;
8 import java.io.RandomAccessFile;
9 import java.util.ArrayList;
10 import java.util.Collection;
11 import java.util.List;
12 import java.util.concurrent.atomic.AtomicBoolean;
14 import com.hughes.util.CachingList;
15 import com.hughes.util.raf.RAFList;
16 import com.hughes.util.raf.RAFSerializable;
17 import com.hughes.util.raf.RAFSerializer;
18 import com.hughes.util.raf.UniformRAFList;
19 import com.ibm.icu.text.Collator;
21 public final class Index implements RAFSerializable<Index> {
23 static final int CACHE_SIZE = 5000;
25 final Dictionary dict;
27 public final String shortName;
28 public final String longName;
30 // persisted: tells how the entries are sorted.
31 public final Language sortLanguage;
34 public final List<IndexEntry> sortedIndexEntries;
39 public final List<RowBase> rows;
41 public final boolean swapPairEntries;
43 // --------------------------------------------------------------------------
45 public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final boolean swapPairEntries) {
47 this.shortName = shortName;
48 this.longName = longName;
49 this.sortLanguage = sortLanguage;
50 this.swapPairEntries = swapPairEntries;
51 sortedIndexEntries = new ArrayList<IndexEntry>();
52 rows = new ArrayList<RowBase>();
55 public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
57 shortName = raf.readUTF();
58 longName = raf.readUTF();
59 final String languageCode = raf.readUTF();
60 sortLanguage = Language.lookup(languageCode);
61 swapPairEntries = raf.readBoolean();
62 if (sortLanguage == null) {
63 throw new IOException("Unsupported language: " + languageCode);
65 sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
66 rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
70 public void write(final RandomAccessFile raf) throws IOException {
71 raf.writeUTF(shortName);
72 raf.writeUTF(longName);
73 raf.writeUTF(sortLanguage.getSymbol());
74 raf.writeBoolean(swapPairEntries);
75 RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
76 UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
79 public void print(final PrintStream out) {
80 for (final RowBase row : rows) {
85 public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
86 public final String token;
87 public final int startRow;
88 public final int numRows;
90 static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
92 public IndexEntry read(RandomAccessFile raf) throws IOException {
93 return new IndexEntry(raf);
96 public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
100 public IndexEntry(final String token, final int startRow, final int numRows) {
101 assert token.equals(token.trim());
102 assert token.length() > 0;
104 this.startRow = startRow;
105 this.numRows = numRows;
108 public IndexEntry(final RandomAccessFile raf) throws IOException {
109 token = raf.readUTF();
110 startRow = raf.readInt();
111 numRows = raf.readInt();
114 public void write(RandomAccessFile raf) throws IOException {
116 raf.writeInt(startRow);
117 raf.writeInt(numRows);
120 public String toString() {
121 return String.format("%s@%d(%d)", token, startRow, numRows);
125 public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
126 token = sortLanguage.textNorm(token, true);
129 int end = sortedIndexEntries.size();
131 final Collator sortCollator = sortLanguage.getSortCollator();
132 while (start < end) {
133 final int mid = (start + end) / 2;
134 if (interrupted.get()) {
137 final IndexEntry midEntry = sortedIndexEntries.get(mid);
139 final int comp = sortCollator.compare(token, sortLanguage.textNorm(midEntry.token, true));
141 final int result = windBackCase(token, mid, sortCollator, interrupted);
142 return sortedIndexEntries.get(result);
143 } else if (comp < 0) {
144 // Log.d("THAD", "Upper bound: " + midEntry);
147 // Log.d("THAD", "Lower bound: " + midEntry);
152 // If we search for a substring of a string that's in there, return that.
153 int result = Math.min(start, sortedIndexEntries.size() - 1);
154 result = windBackCase(sortLanguage.textNorm(sortedIndexEntries.get(result).token, true), result, sortCollator, interrupted);
155 return sortedIndexEntries.get(result);
158 public static final class SearchResult {
159 public final IndexEntry insertionPoint;
160 public final IndexEntry longestPrefix;
161 public final String longestPrefixString;
162 public final boolean success;
164 public SearchResult(IndexEntry insertionPoint, IndexEntry longestPrefix,
165 String longestPrefixString, boolean success) {
166 this.insertionPoint = insertionPoint;
167 this.longestPrefix = longestPrefix;
168 this.longestPrefixString = longestPrefixString;
169 this.success = success;
173 public String toString() {
174 return String.format("inerstionPoint=%s,longestPrefix=%s,longestPrefixString=%s,success=%b", insertionPoint.toString(), longestPrefix.toString(), longestPrefixString, success);
178 public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
179 if (token.length() == 0) {
180 return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
182 IndexEntry insertionPoint = null;
183 IndexEntry result = null;
184 boolean unmodified = true;
185 while (!interrupted.get() && token.length() > 0) {
186 result = findInsertionPoint(token, interrupted);
187 if (result == null) {
191 insertionPoint = result;
193 if (sortLanguage.textNorm(result.token, true).startsWith(sortLanguage.textNorm(token, true))) {
194 return new SearchResult(insertionPoint, result, token, unmodified);
197 token = token.substring(0, token.length() - 1);
199 return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
202 private final int windBackCase(final String token, int result, final Collator sortCollator, final AtomicBoolean interrupted) {
203 while (result > 0 && sortCollator.compare(sortLanguage.textNorm(sortedIndexEntries.get(result - 1).token, true), token) >= 0) {
205 if (interrupted.get()) {