X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FIndex.java;h=30c18e2eb3bce1acd67ae9cac22d4ad102bb7637;hb=83d9dc7cd871082a82c2dd0dbb7a0ceabd7c83a0;hp=04691ff0c5ede6e8289c16949cec69b2bc54d1d4;hpb=3f9b8540ee99ca4039e9357e021bdccb41b532ef;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/engine/Index.java b/src/com/hughes/android/dictionary/engine/Index.java index 04691ff..30c18e2 100644 --- a/src/com/hughes/android/dictionary/engine/Index.java +++ b/src/com/hughes/android/dictionary/engine/Index.java @@ -12,10 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -/** - * - */ - package com.hughes.android.dictionary.engine; import com.hughes.android.dictionary.DictionaryInfo; @@ -36,6 +32,7 @@ import java.io.DataOutput; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; +import java.nio.channels.FileChannel; import java.util.AbstractList; import java.util.ArrayList; import java.util.Collection; @@ -52,7 +49,7 @@ import java.util.regex.Pattern; public final class Index implements RAFSerializable { - static final int CACHE_SIZE = 5000; + private static final int CACHE_SIZE = 5000; public final Dictionary dict; @@ -61,7 +58,7 @@ public final class Index implements RAFSerializable { // persisted: tells how the entries are sorted. public final Language sortLanguage; - final String normalizerRules; + private final String normalizerRules; // Built from the two above. private Transliterator normalizer; @@ -70,7 +67,7 @@ public final class Index implements RAFSerializable { public final List sortedIndexEntries; // persisted. - public final Set stoplist; + private final Set stoplist; // One big list! // Various sub-types. @@ -79,7 +76,8 @@ public final class Index implements RAFSerializable { public final boolean swapPairEntries; // Version 2: - int mainTokenCount = -1; + @SuppressWarnings("WeakerAccess") + public int mainTokenCount = -1; // -------------------------------------------------------------------------- @@ -92,9 +90,9 @@ public final class Index implements RAFSerializable { this.sortLanguage = sortLanguage; this.normalizerRules = normalizerRules; this.swapPairEntries = swapPairEntries; - sortedIndexEntries = new ArrayList(); + sortedIndexEntries = new ArrayList<>(); this.stoplist = stoplist; - rows = new ArrayList(); + rows = new ArrayList<>(); normalizer = null; } @@ -102,6 +100,7 @@ public final class Index implements RAFSerializable { /** * Deferred initialization because it can be slow. */ + @SuppressWarnings("WeakerAccess") public synchronized Transliterator normalizer() { if (normalizer == null) { normalizer = TransliteratorManager.get(normalizerRules); @@ -113,13 +112,13 @@ public final class Index implements RAFSerializable { * Note that using this comparator probably involves doing too many text * normalizations. */ + @SuppressWarnings("WeakerAccess") public NormalizeComparator getSortComparator() { return new NormalizeComparator(normalizer(), sortLanguage.getCollator(), dict.dictFileVersion); } - public Index(final Dictionary dict, final DataInput inp) throws IOException { + public Index(final Dictionary dict, final FileChannel inp, final DataInput raf) throws IOException { this.dict = dict; - RandomAccessFile raf = (RandomAccessFile)inp; shortName = raf.readUTF(); longName = raf.readUTF(); final String languageCode = raf.readUTF(); @@ -133,11 +132,11 @@ public final class Index implements RAFSerializable { mainTokenCount = raf.readInt(); } sortedIndexEntries = CachingList.create( - RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(), - dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE); + RAFList.create(inp, new IndexEntrySerializer(dict.dictFileVersion == 6 ? inp : null), inp.position(), + dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE, true); if (dict.dictFileVersion >= 7) { int count = StringUtil.readVarInt(raf); - stoplist = new HashSet(count); + stoplist = new HashSet<>(count); for (int i = 0; i < count; ++i) { stoplist.add(raf.readUTF()); } @@ -147,8 +146,8 @@ public final class Index implements RAFSerializable { stoplist = Collections.emptySet(); } rows = CachingList.create( - UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), - CACHE_SIZE); + UniformRAFList.create(inp, new RowBase.Serializer(this), inp.position()), + CACHE_SIZE, true); } @Override @@ -162,7 +161,7 @@ public final class Index implements RAFSerializable { if (dict.dictFileVersion >= 2) { raf.writeInt(mainTokenCount); } - RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 32, true); + RAFList.write(raf, sortedIndexEntries, new IndexEntrySerializer(null), 32, true); StringUtil.writeVarInt(raf, stoplist.size()); for (String i : stoplist) { raf.writeUTF(i); @@ -176,23 +175,29 @@ public final class Index implements RAFSerializable { } } - private final RAFSerializer indexEntrySerializer = new RAFSerializer() { + private final class IndexEntrySerializer implements RAFSerializer { + private final FileChannel ch; + + IndexEntrySerializer(FileChannel ch) { + this.ch = ch; + } + @Override public IndexEntry read(DataInput raf) throws IOException { - return new IndexEntry(Index.this, raf); + return new IndexEntry(Index.this, ch, raf); } @Override public void write(DataOutput raf, IndexEntry t) throws IOException { t.write(raf); } - }; + } public static final class IndexEntry implements RAFSerializable { public final String token; private final String normalizedToken; public final int startRow; - public final int numRows; // doesn't count the token row! + final int numRows; // doesn't count the token row! public List htmlEntries; public IndexEntry(final Index index, final String token, final String normalizedToken, @@ -203,10 +208,10 @@ public final class Index implements RAFSerializable { this.normalizedToken = normalizedToken; this.startRow = startRow; this.numRows = numRows; - this.htmlEntries = new ArrayList(); + this.htmlEntries = new ArrayList<>(); } - public IndexEntry(final Index index, final DataInput raf) throws IOException { + IndexEntry(final Index index, final FileChannel ch, final DataInput raf) throws IOException { token = raf.readUTF(); if (index.dict.dictFileVersion >= 7) { startRow = StringUtil.readVarInt(raf); @@ -239,9 +244,9 @@ public final class Index implements RAFSerializable { } } else if (index.dict.dictFileVersion >= 6) { this.htmlEntries = CachingList.create( - RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer, - ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion, - index.dict.dictInfo + " htmlEntries: "), 1); + RAFList.create(ch, index.dict.htmlEntryIndexSerializer, + ch.position(), index.dict.dictFileVersion, + index.dict.dictInfo + " htmlEntries: "), 1, false); } else { this.htmlEntries = Collections.emptyList(); } @@ -265,12 +270,12 @@ public final class Index implements RAFSerializable { return String.format("%s@%d(%d)", token, startRow, numRows); } - public String normalizedToken() { + String normalizedToken() { return normalizedToken; } } - static final TransformingList.Transformer INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer() { + private static final TransformingList.Transformer INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer() { @Override public String transform(IndexEntry t1) { return t1.token; @@ -292,18 +297,32 @@ public final class Index implements RAFSerializable { return index != -1 ? sortedIndexEntries.get(index) : null; } - private int compareIdx(String token, final Comparator sortCollator, int idx) { + private int compareIdx(String token, final Comparator sortCollator, int idx) { final IndexEntry entry = sortedIndexEntries.get(idx); return NormalizeComparator.compareWithoutDash(token, entry.normalizedToken(), sortCollator, dict.dictFileVersion); } - public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) { + private int findMatchLen(final Comparator sortCollator, String a, String b) { + int start = 0; + int end = Math.min(a.length(), b.length()); + while (start < end) + { + int mid = (start + end + 1) / 2; + if (sortCollator.compare(a.substring(0, mid), b.substring(0, mid)) == 0) + start = mid; + else + end = mid - 1; + } + return start; + } + + private int findInsertionPointIndex(String token, final AtomicBoolean interrupted) { token = normalizeToken(token); int start = 0; int end = sortedIndexEntries.size(); - final Comparator sortCollator = sortLanguage.getCollator(); + final Comparator sortCollator = sortLanguage.getCollator(); while (start < end) { final int mid = (start + end) / 2; if (interrupted.get()) { @@ -315,8 +334,7 @@ public final class Index implements RAFSerializable { if (comp == 0) comp = sortCollator.compare(token, midEntry.normalizedToken()); if (comp == 0) { - final int result = windBackCase(token, mid, interrupted); - return result; + return windBackCase(token, mid, interrupted); } else if (comp < 0) { // System.out.println("Upper bound: " + midEntry + ", norm=" + // midEntry.normalizedToken() + ", mid=" + mid); @@ -344,6 +362,15 @@ public final class Index implements RAFSerializable { } } + // if the word before is the better match, move + // our result to it + if (start > 0 && start < sortedIndexEntries.size()) { + String prev = sortedIndexEntries.get(start - 1).normalizedToken(); + String next = sortedIndexEntries.get(start).normalizedToken(); + if (findMatchLen(sortCollator, token, prev) >= findMatchLen(sortCollator, token, next)) + start--; + } + // If we search for a substring of a string that's in there, return // that. int result = Math.min(start, sortedIndexEntries.size() - 1); @@ -367,7 +394,7 @@ public final class Index implements RAFSerializable { private static final int MAX_SEARCH_ROWS = 1000; - private final Map prefixToNumRows = new HashMap(); + private final Map prefixToNumRows = new HashMap<>(); private synchronized final int getUpperBoundOnRowsStartingWith(final String normalizedPrefix, final int maxRows, final AtomicBoolean interrupted) { @@ -392,7 +419,7 @@ public final class Index implements RAFSerializable { break; } } - prefixToNumRows.put(normalizedPrefix, numRows); + prefixToNumRows.put(normalizedPrefix, rowCount); return rowCount; } @@ -400,9 +427,9 @@ public final class Index implements RAFSerializable { final String searchText, final List searchTokens, final AtomicBoolean interrupted) { final long startMills = System.currentTimeMillis(); - final List result = new ArrayList(); + final List result = new ArrayList<>(); - final Set normalizedNonStoplist = new HashSet(); + final Set normalizedNonStoplist = new HashSet<>(); String bestPrefix = null; int leastRows = Integer.MAX_VALUE; @@ -446,8 +473,8 @@ public final class Index implements RAFSerializable { + ", searchTokens=" + searchTokens); // Place to store the things that match. - final Map> matches = new EnumMap>( - RowMatchType.class); + final Map> matches = new EnumMap<>( + RowMatchType.class); for (final RowMatchType rowMatchType : RowMatchType.values()) { if (rowMatchType != RowMatchType.NO_MATCH) { matches.put(rowMatchType, new ArrayList()); @@ -466,7 +493,7 @@ public final class Index implements RAFSerializable { final String searchToken = bestPrefix; final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted); - final Set rowsAlreadySeen = new HashSet(); + final Set rowsAlreadySeen = new HashSet<>(); for (int index = insertionPointIndex; index < sortedIndexEntries.size() && matchCount < MAX_SEARCH_ROWS; ++index) { if (interrupted.get()) { @@ -506,7 +533,7 @@ public final class Index implements RAFSerializable { final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator( swapPairEntries); for (final Collection rows : matches.values()) { - final List ordered = new ArrayList(rows); + final List ordered = new ArrayList<>(rows); Collections.sort(ordered, lengthComparator); result.addAll(ordered); }