// See the License for the specific language governing permissions and
// limitations under the License.
-/**
- *
- */
-
package com.hughes.android.dictionary.engine;
import com.hughes.android.dictionary.DictionaryInfo;
import com.hughes.util.raf.RAFSerializer;
import com.hughes.util.raf.SerializableSerializer;
import com.hughes.util.raf.UniformRAFList;
-import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Transliterator;
import java.io.DataInput;
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
public final class Index implements RAFSerializable<Index> {
- static final int CACHE_SIZE = 5000;
+ private static final int CACHE_SIZE = 5000;
public final Dictionary dict;
// persisted: tells how the entries are sorted.
public final Language sortLanguage;
- final String normalizerRules;
+ private final String normalizerRules;
// Built from the two above.
private Transliterator normalizer;
public final List<IndexEntry> sortedIndexEntries;
// persisted.
- public final Set<String> stoplist;
+ private final Set<String> stoplist;
// One big list!
// Various sub-types.
public final boolean swapPairEntries;
// Version 2:
- int mainTokenCount = -1;
+ @SuppressWarnings("WeakerAccess")
+ public int mainTokenCount = -1;
// --------------------------------------------------------------------------
public Index(final Dictionary dict, final String shortName, final String longName,
- final Language sortLanguage, final String normalizerRules,
- final boolean swapPairEntries, final Set<String> stoplist) {
+ final Language sortLanguage, final String normalizerRules,
+ final boolean swapPairEntries, final Set<String> stoplist) {
this.dict = dict;
this.shortName = shortName;
this.longName = longName;
this.sortLanguage = sortLanguage;
this.normalizerRules = normalizerRules;
this.swapPairEntries = swapPairEntries;
- sortedIndexEntries = new ArrayList<IndexEntry>();
+ sortedIndexEntries = new ArrayList<>();
this.stoplist = stoplist;
- rows = new ArrayList<RowBase>();
+ rows = new ArrayList<>();
normalizer = null;
}
/**
* Deferred initialization because it can be slow.
*/
+ @SuppressWarnings("WeakerAccess")
public synchronized Transliterator normalizer() {
if (normalizer == null) {
- normalizer = Transliterator
- .createFromRules("", normalizerRules, Transliterator.FORWARD);
+ normalizer = TransliteratorManager.get(normalizerRules);
}
return normalizer;
}
* Note that using this comparator probably involves doing too many text
* normalizations.
*/
+ @SuppressWarnings("WeakerAccess")
public NormalizeComparator getSortComparator() {
- return new NormalizeComparator(normalizer(), sortLanguage.getCollator());
+ return new NormalizeComparator(normalizer(), sortLanguage.getCollator(), dict.dictFileVersion);
}
- public Index(final Dictionary dict, final DataInput inp) throws IOException {
+ public Index(final Dictionary dict, final FileChannel inp, final DataInput raf) throws IOException {
this.dict = dict;
- RandomAccessFile raf = (RandomAccessFile)inp;
shortName = raf.readUTF();
longName = raf.readUTF();
final String languageCode = raf.readUTF();
mainTokenCount = raf.readInt();
}
sortedIndexEntries = CachingList.create(
- RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(), dict.dictFileVersion,
- dict.dictFileVersion >= 7 ? 16 : 1, dict.dictFileVersion >= 7), CACHE_SIZE);
+ RAFList.create(inp, new IndexEntrySerializer(dict.dictFileVersion == 6 ? inp : null), inp.position(),
+ dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE, true);
if (dict.dictFileVersion >= 7) {
int count = StringUtil.readVarInt(raf);
- stoplist = new HashSet<String>(count);
+ stoplist = new HashSet<>(count);
for (int i = 0; i < count; ++i) {
stoplist.add(raf.readUTF());
}
- } else if (dict.dictFileVersion >= 4) {
+ } else if (dict.dictFileVersion >= 4) {
stoplist = new SerializableSerializer<Set<String>>().read(raf);
} else {
stoplist = Collections.emptySet();
}
rows = CachingList.create(
- UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()),
- CACHE_SIZE);
+ UniformRAFList.create(inp, new RowBase.Serializer(this), inp.position()),
+ CACHE_SIZE, true);
}
@Override
raf.writeUTF(sortLanguage.getIsoCode());
raf.writeUTF(normalizerRules);
raf.writeBoolean(swapPairEntries);
- if (dict.dictFileVersion >= 2) {
- raf.writeInt(mainTokenCount);
- }
- RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 16, true);
+ raf.writeInt(mainTokenCount);
+ RAFList.write(raf, sortedIndexEntries, new IndexEntrySerializer(null), 32, true);
StringUtil.writeVarInt(raf, stoplist.size());
for (String i : stoplist) {
raf.writeUTF(i);
}
}
- private final RAFSerializer<IndexEntry> indexEntrySerializer = new RAFSerializer<IndexEntry>() {
+ private final class IndexEntrySerializer implements RAFSerializer<IndexEntry> {
+ private final FileChannel ch;
+
+ IndexEntrySerializer(FileChannel ch) {
+ this.ch = ch;
+ }
+
@Override
public IndexEntry read(DataInput raf) throws IOException {
- return new IndexEntry(Index.this, raf);
+ return new IndexEntry(Index.this, ch, raf);
}
@Override
public void write(DataOutput raf, IndexEntry t) throws IOException {
t.write(raf);
}
- };
+ }
public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
- private final Index index;
public final String token;
private final String normalizedToken;
public final int startRow;
- public final int numRows; // doesn't count the token row!
+ final int numRows; // doesn't count the token row!
public List<HtmlEntry> htmlEntries;
- private int[] htmlEntryIndices;
public IndexEntry(final Index index, final String token, final String normalizedToken,
- final int startRow, final int numRows) {
- this.index = index;
+ final int startRow, final int numRows) {
assert token.equals(token.trim());
assert token.length() > 0;
this.token = token;
this.normalizedToken = normalizedToken;
this.startRow = startRow;
this.numRows = numRows;
- this.htmlEntries = new ArrayList<HtmlEntry>();
+ this.htmlEntries = new ArrayList<>();
}
- public IndexEntry(final Index index, final DataInput raf) throws IOException {
- this.index = index;
+ IndexEntry(final Index index, final FileChannel ch, final DataInput raf) throws IOException {
token = raf.readUTF();
if (index.dict.dictFileVersion >= 7) {
startRow = StringUtil.readVarInt(raf);
}
final boolean hasNormalizedForm = raf.readBoolean();
normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
- htmlEntryIndices = null;
if (index.dict.dictFileVersion >= 7) {
int size = StringUtil.readVarInt(raf);
- htmlEntryIndices = new int[size];
- for (int i = 0; i < size; ++i) {
- htmlEntryIndices[i] = StringUtil.readVarInt(raf);
- }
- this.htmlEntries = CachingList.create(new AbstractList<HtmlEntry>() {
- @Override
- public HtmlEntry get(int i) {
- return index.dict.htmlEntries.get(htmlEntryIndices[i]);
- }
- @Override
- public int size() {
- return htmlEntryIndices.length;
+ if (size == 0) {
+ this.htmlEntries = Collections.emptyList();
+ } else {
+ final int[] htmlEntryIndices = new int[size];
+ for (int i = 0; i < size; ++i) {
+ htmlEntryIndices[i] = StringUtil.readVarInt(raf);
}
- }, 1);
+ this.htmlEntries = new AbstractList<HtmlEntry>() {
+ @Override
+ public HtmlEntry get(int i) {
+ return index.dict.htmlEntries.get(htmlEntryIndices[i]);
+ }
+ @Override
+ public int size() {
+ return htmlEntryIndices.length;
+ }
+ };
+ }
} else if (index.dict.dictFileVersion >= 6) {
this.htmlEntries = CachingList.create(
- RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer,
- ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion), 1);
+ RAFList.create(ch, index.dict.htmlEntryIndexSerializer,
+ ch.position(), index.dict.dictFileVersion,
+ index.dict.dictInfo + " htmlEntries: "), 1, false);
} else {
this.htmlEntries = Collections.emptyList();
}
return String.format("%s@%d(%d)", token, startRow, numRows);
}
- public String normalizedToken() {
+ String normalizedToken() {
return normalizedToken;
}
}
- static final TransformingList.Transformer<IndexEntry, String> INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer<IndexEntry, String>() {
+ private static final TransformingList.Transformer<IndexEntry, String> INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer<IndexEntry, String>() {
@Override
public String transform(IndexEntry t1) {
return t1.token;
public IndexEntry findExact(final String exactToken) {
final int result = Collections.binarySearch(
- TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken,
- getSortComparator());
+ TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken,
+ getSortComparator());
if (result >= 0) {
return sortedIndexEntries.get(result);
}
return index != -1 ? sortedIndexEntries.get(index) : null;
}
- public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
+ private int compareIdx(String token, final Comparator<Object> sortCollator, int idx) {
+ final IndexEntry entry = sortedIndexEntries.get(idx);
+ return NormalizeComparator.compareWithoutDash(token, entry.normalizedToken(), sortCollator, dict.dictFileVersion);
+ }
+
+ private int findMatchLen(final Comparator<Object> sortCollator, String a, String b) {
+ int start = 0;
+ int end = Math.min(a.length(), b.length());
+ while (start < end)
+ {
+ int mid = (start + end + 1) / 2;
+ if (sortCollator.compare(a.substring(0, mid), b.substring(0, mid)) == 0)
+ start = mid;
+ else
+ end = mid - 1;
+ }
+ return start;
+ }
+
+ private int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
token = normalizeToken(token);
int start = 0;
int end = sortedIndexEntries.size();
- final Comparator sortCollator = sortLanguage.getCollator();
+ final Comparator<Object> sortCollator = sortLanguage.getCollator();
while (start < end) {
final int mid = (start + end) / 2;
if (interrupted.get()) {
}
final IndexEntry midEntry = sortedIndexEntries.get(mid);
- final int comp = sortCollator.compare(token, midEntry.normalizedToken());
+ int comp = NormalizeComparator.compareWithoutDash(token, midEntry.normalizedToken(), sortCollator, dict.dictFileVersion);
+ if (comp == 0)
+ comp = sortCollator.compare(token, midEntry.normalizedToken());
if (comp == 0) {
- final int result = windBackCase(token, mid, interrupted);
- return result;
+ return windBackCase(token, mid, interrupted);
} else if (comp < 0) {
// System.out.println("Upper bound: " + midEntry + ", norm=" +
// midEntry.normalizedToken() + ", mid=" + mid);
- end = mid;
+
+ // Hack for robustness if sort order is broken
+ if (mid + 2 < end &&
+ compareIdx(token, sortCollator, mid + 1) > 0 &&
+ compareIdx(token, sortCollator, mid + 2) > 0) {
+ start = mid;
+ } else {
+ end = mid;
+ }
} else {
// System.out.println("Lower bound: " + midEntry + ", norm=" +
// midEntry.normalizedToken() + ", mid=" + mid);
- start = mid + 1;
+
+ // Hack for robustness if sort order is broken
+ if (mid - 2 >= start &&
+ compareIdx(token, sortCollator, mid - 1) < 0 &&
+ compareIdx(token, sortCollator, mid - 2) < 0) {
+ end = mid + 1;
+ } else {
+ start = mid + 1;
+ }
}
}
+ // if the word before is the better match, move
+ // our result to it
+ if (start > 0 && start < sortedIndexEntries.size()) {
+ String prev = sortedIndexEntries.get(start - 1).normalizedToken();
+ String next = sortedIndexEntries.get(start).normalizedToken();
+ if (findMatchLen(sortCollator, token, prev) >= findMatchLen(sortCollator, token, next))
+ start--;
+ }
+
// If we search for a substring of a string that's in there, return
// that.
int result = Math.min(start, sortedIndexEntries.size() - 1);
private static final int MAX_SEARCH_ROWS = 1000;
- private final Map<String, Integer> prefixToNumRows = new LinkedHashMap<String, Integer>();
+ private final Map<String, Integer> prefixToNumRows = new HashMap<>();
private synchronized final int getUpperBoundOnRowsStartingWith(final String normalizedPrefix,
final int maxRows, final AtomicBoolean interrupted) {
return -1;
}
final IndexEntry indexEntry = sortedIndexEntries.get(index);
- if (!indexEntry.normalizedToken.startsWith(normalizedPrefix)) {
+ if (!indexEntry.normalizedToken.startsWith(normalizedPrefix) &&
+ !NormalizeComparator.withoutDash(indexEntry.normalizedToken).startsWith(normalizedPrefix)) {
break;
}
rowCount += indexEntry.numRows + indexEntry.htmlEntries.size();
break;
}
}
- prefixToNumRows.put(normalizedPrefix, numRows);
+ prefixToNumRows.put(normalizedPrefix, rowCount);
return rowCount;
}
public final List<RowBase> multiWordSearch(
- final String searchText, final List<String> searchTokens,
- final AtomicBoolean interrupted) {
+ final String searchText, final List<String> searchTokens,
+ final AtomicBoolean interrupted) {
final long startMills = System.currentTimeMillis();
- final List<RowBase> result = new ArrayList<RowBase>();
+ final List<RowBase> result = new ArrayList<>();
- final Set<String> normalizedNonStoplist = new LinkedHashSet<String>();
+ final Set<String> normalizedNonStoplist = new HashSet<>();
String bestPrefix = null;
int leastRows = Integer.MAX_VALUE;
if (!stoplist.contains(searchToken)) {
if (normalizedNonStoplist.add(normalized)) {
final int numRows = getUpperBoundOnRowsStartingWith(normalized,
- MAX_SEARCH_ROWS, interrupted);
+ MAX_SEARCH_ROWS, interrupted);
if (numRows != -1 && numRows < leastRows) {
if (numRows == 0) {
// We really are done here.
System.out.println("Everything was in the stoplist!");
}
System.out.println("Searching using prefix: " + bestPrefix + ", leastRows=" + leastRows
- + ", searchTokens=" + searchTokens);
+ + ", searchTokens=" + searchTokens);
// Place to store the things that match.
- final Map<RowMatchType, List<RowBase>> matches = new EnumMap<RowMatchType, List<RowBase>>(
+ final Map<RowMatchType, List<RowBase>> matches = new EnumMap<>(
RowMatchType.class);
for (final RowMatchType rowMatchType : RowMatchType.values()) {
if (rowMatchType != RowMatchType.NO_MATCH) {
final String searchToken = bestPrefix;
final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
- final Set<RowKey> rowsAlreadySeen = new HashSet<RowBase.RowKey>();
+ final Set<RowKey> rowsAlreadySeen = new HashSet<>();
for (int index = insertionPointIndex; index < sortedIndexEntries.size()
&& matchCount < MAX_SEARCH_ROWS; ++index) {
if (interrupted.get()) {
return null;
}
final IndexEntry indexEntry = sortedIndexEntries.get(index);
- if (!indexEntry.normalizedToken.startsWith(searchToken)) {
+ if (!indexEntry.normalizedToken.startsWith(searchToken) &&
+ !NormalizeComparator.withoutDash(indexEntry.normalizedToken).startsWith(searchToken)) {
break;
}
}
rowsAlreadySeen.add(rowKey);
final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(),
- swapPairEntries);
+ swapPairEntries);
if (matchType != RowMatchType.NO_MATCH) {
matches.get(matchType).add(row);
++matchCount;
// Sort them into a reasonable order.
final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator(
- swapPairEntries);
+ swapPairEntries);
for (final Collection<RowBase> rows : matches.values()) {
- final List<RowBase> ordered = new ArrayList<RowBase>(rows);
+ final List<RowBase> ordered = new ArrayList<>(rows);
Collections.sort(ordered, lengthComparator);
result.addAll(ordered);
}
}
private String normalizeToken(final String searchToken) {
- if (TransliteratorManager.init(null)) {
+ if (TransliteratorManager.init(null, null)) {
final Transliterator normalizer = normalizer();
return normalizer.transliterate(searchToken);
} else {