1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
18 package com.hughes.android.dictionary.engine;
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;

import com.hughes.android.dictionary.DictionaryInfo;
import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
import com.hughes.util.CachingList;
import com.hughes.util.raf.RAFList;
import com.hughes.util.raf.RAFSerializable;
import com.hughes.util.raf.RAFSerializer;
import com.hughes.util.raf.UniformRAFList;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Transliterator;
38 public final class Index implements RAFSerializable<Index> {
// Cache size handed to CachingList.create() for both file-backed lists
// (sortedIndexEntries and rows) when an index is loaded from a file.
static final int CACHE_SIZE = 5000;

// Dictionary this index belongs to. Its dictFileVersion decides whether
// mainTokenCount is read from / written to the file (only for versions >= 2).
final Dictionary dict;

public final String shortName; // Typically the ISO code for the language.
public final String longName;

// persisted: tells how the entries are sorted.
public final Language sortLanguage;
// persisted: rule text passed to Transliterator.createFromRules() by normalizer().
final String normalizerRules;

// Created lazily from normalizerRules by normalizer(); stays null until first requested.
private Transliterator normalizer;

// persisted: index entries. findInsertionPoint() binary-searches this list by
// normalized token using sortLanguage's collator, so it is expected to be in that order.
// Either a plain ArrayList (freshly built index) or a cached, file-backed list (loaded index).
public final List<IndexEntry> sortedIndexEntries;

// persisted: all rows of this index in a single list. Backed the same two ways as
// sortedIndexEntries; IndexEntry.startRow / numRows presumably refer into it -- TODO confirm.
public final List<RowBase> rows;
// persisted as a single boolean. NOTE(review): its meaning is not evident from this
// class; presumably it controls the display order of pair entries -- confirm with callers.
public final boolean swapPairEntries;

// Only stored in dictionary files with dictFileVersion >= 2; remains -1 otherwise.
int mainTokenCount = -1;
66 // --------------------------------------------------------------------------
/**
 * Creates an empty, in-memory index that can subsequently be filled and written out.
 *
 * <p>{@link #sortedIndexEntries} and {@link #rows} both start out as empty
 * {@link ArrayList}s. The normalizer is not built here; {@link #normalizer()}
 * creates it on first use.
 *
 * @param dict the dictionary that owns this index
 * @param shortName short name of the index, typically the ISO code for the language
 * @param longName long name of the index
 * @param sortLanguage language that determines how the entries are sorted
 * @param normalizerRules transliterator rules from which the token normalizer is built
 * @param swapPairEntries value stored in {@link #swapPairEntries}
 */
public Index(final Dictionary dict, final String shortName, final String longName,
    final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) {
  // dict is final and is dereferenced by write(), so it must be assigned here.
  this.dict = dict;
  this.shortName = shortName;
  this.longName = longName;
  this.sortLanguage = sortLanguage;
  this.normalizerRules = normalizerRules;
  this.swapPairEntries = swapPairEntries;
  this.sortedIndexEntries = new ArrayList<IndexEntry>();
  this.rows = new ArrayList<RowBase>();
}
81 public synchronized Transliterator normalizer() {
82 if (normalizer == null) {
83 normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
/**
 * Loads an index from the current position of {@code raf}.
 *
 * <p>Fields are read in the same order in which {@link #write(RandomAccessFile)} emits
 * them: short name, long name, language code, normalizer rules, the swap flag, the main
 * token count (only when {@code dict.dictFileVersion >= 2}), then the index-entry list
 * and the row list. Both lists stay file-backed and are wrapped in a cache of
 * {@link #CACHE_SIZE} elements.
 *
 * @param dict the dictionary that owns this index; its {@code dictFileVersion} selects
 *     the on-disk layout
 * @param raf file positioned at the start of a serialized index
 * @throws IOException if reading fails or the stored language code is not recognized by
 *     {@code Language.lookup}
 */
public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
  // dict is final and is consulted below (and again in write()), so assign it first.
  this.dict = dict;
  shortName = raf.readUTF();
  longName = raf.readUTF();
  final String languageCode = raf.readUTF();
  sortLanguage = Language.lookup(languageCode);
  normalizerRules = raf.readUTF();
  swapPairEntries = raf.readBoolean();
  if (sortLanguage == null) {
    throw new IOException("Unsupported language: " + languageCode);
  }
  if (dict.dictFileVersion >= 2) {
    mainTokenCount = raf.readInt();
  }
  // Each list starts at whatever position raf reports at that moment.
  // NOTE(review): this relies on RAFList.create leaving the file pointer just past the
  // entry list so that the row list starts there -- confirm against RAFList.
  sortedIndexEntries = CachingList.create(
      RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
  rows = CachingList.create(
      UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
}
107 public void write(final RandomAccessFile raf) throws IOException {
108 raf.writeUTF(shortName);
109 raf.writeUTF(longName);
110 raf.writeUTF(sortLanguage.getIsoCode());
111 raf.writeUTF(normalizerRules);
112 raf.writeBoolean(swapPairEntries);
113 if (dict.dictFileVersion >= 2) {
114 raf.writeInt(mainTokenCount);
116 RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
117 UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
120 public void print(final PrintStream out) {
121 for (final RowBase row : rows) {
126 public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
127 public final String token;
128 private final String normalizedToken;
129 public final int startRow;
130 public final int numRows;
133 static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
135 public IndexEntry read(RandomAccessFile raf) throws IOException {
136 return new IndexEntry(raf);
139 public void write(RandomAccessFile raf, IndexEntry t) throws IOException {
143 public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
144 assert token.equals(token.trim());
145 assert token.length() > 0;
147 this.normalizedToken = normalizedToken;
148 this.startRow = startRow;
149 this.numRows = numRows;
152 public IndexEntry(final RandomAccessFile raf) throws IOException {
153 token = raf.readUTF();
154 startRow = raf.readInt();
155 numRows = raf.readInt();
156 final boolean hasNormalizedForm = raf.readBoolean();
157 normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
160 public void write(RandomAccessFile raf) throws IOException {
162 raf.writeInt(startRow);
163 raf.writeInt(numRows);
164 final boolean hasNormalizedForm = !token.equals(normalizedToken);
165 raf.writeBoolean(hasNormalizedForm);
166 if (hasNormalizedForm) {
167 raf.writeUTF(normalizedToken);
171 public String toString() {
172 return String.format("%s@%d(%d)", token, startRow, numRows);
175 public String normalizedToken() {
176 return normalizedToken;
180 public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
181 if (TransliteratorManager.init(null)) {
182 final Transliterator normalizer = normalizer();
183 token = normalizer.transliterate(token);
185 // Do our best since the Transliterators aren't up yet.
186 token = token.toLowerCase();
190 int end = sortedIndexEntries.size();
192 final Collator sortCollator = sortLanguage.getCollator();
193 while (start < end) {
194 final int mid = (start + end) / 2;
195 if (interrupted.get()) {
198 final IndexEntry midEntry = sortedIndexEntries.get(mid);
200 final int comp = sortCollator.compare(token, midEntry.normalizedToken());
202 final int result = windBackCase(token, mid, interrupted);
203 return sortedIndexEntries.get(result);
204 } else if (comp < 0) {
205 //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
208 //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
213 // If we search for a substring of a string that's in there, return that.
214 int result = Math.min(start, sortedIndexEntries.size() - 1);
215 result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
216 return sortedIndexEntries.get(result);
219 private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
220 while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
222 if (interrupted.get()) {
229 public IndexInfo getIndexInfo() {
230 return new DictionaryInfo.IndexInfo(shortName, sortedIndexEntries.size(), mainTokenCount);