X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FIndex.java;h=b9300ba0462c9846398229ad96c561f45343fe36;hb=f8e4d0f62dc4f4fe3577c5bb03e3d8fa8a956e5b;hp=cb29bac28ab989a40d6f12c7d647f77b39c974ab;hpb=f8329ca5a93f93c26bc9f014a831da876f32867d;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/engine/Index.java b/src/com/hughes/android/dictionary/engine/Index.java index cb29bac..b9300ba 100644 --- a/src/com/hughes/android/dictionary/engine/Index.java +++ b/src/com/hughes/android/dictionary/engine/Index.java @@ -12,28 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -/** - * - */ - package com.hughes.android.dictionary.engine; -import com.hughes.android.dictionary.DictionaryInfo; -import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; -import com.hughes.android.dictionary.engine.RowBase.RowKey; -import com.hughes.util.CachingList; -import com.hughes.util.StringUtil; -import com.hughes.util.TransformingList; -import com.hughes.util.raf.RAFList; -import com.hughes.util.raf.RAFSerializable; -import com.hughes.util.raf.RAFSerializer; -import com.hughes.util.raf.SerializableSerializer; -import com.hughes.util.raf.UniformRAFList; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Transliterator; - +import java.io.BufferedOutputStream; import java.io.DataInput; import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; @@ -41,19 +26,32 @@ import java.util.AbstractList; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.EnumMap; +import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Pattern; -public final class Index implements RAFSerializable { +import com.hughes.android.dictionary.DictionaryInfo; +import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; +import com.hughes.android.dictionary.engine.RowBase.RowKey; +import com.hughes.util.CachingList; +import com.hughes.util.DataInputBuffer; +import com.hughes.util.StringUtil; +import com.hughes.util.TransformingList; +import com.hughes.util.raf.RAFList; +import com.hughes.util.raf.RAFSerializer; +import com.hughes.util.raf.UniformRAFList; +import com.ibm.icu.text.Transliterator; + +public final class Index { - static final int CACHE_SIZE = 5000; + private static final int CACHE_SIZE = 5000; public final Dictionary dict; @@ -62,7 +60,7 @@ public final class Index implements RAFSerializable { // persisted: tells how the entries are sorted. public final Language sortLanguage; - final String normalizerRules; + public final String normalizerRules; // Built from the two above. private Transliterator normalizer; @@ -80,22 +78,23 @@ public final class Index implements RAFSerializable { public final boolean swapPairEntries; // Version 2: - int mainTokenCount = -1; + @SuppressWarnings("WeakerAccess") + public int mainTokenCount = -1; // -------------------------------------------------------------------------- public Index(final Dictionary dict, final String shortName, final String longName, - final Language sortLanguage, final String normalizerRules, - final boolean swapPairEntries, final Set stoplist) { + final Language sortLanguage, final String normalizerRules, + final boolean swapPairEntries, final Set stoplist) { this.dict = dict; this.shortName = shortName; this.longName = longName; this.sortLanguage = sortLanguage; this.normalizerRules = normalizerRules; this.swapPairEntries = swapPairEntries; - sortedIndexEntries = new ArrayList(); + sortedIndexEntries = new ArrayList<>(); this.stoplist = stoplist; - rows = new ArrayList(); + rows = new ArrayList<>(); normalizer = null; } @@ -103,10 +102,10 @@ public final class Index implements RAFSerializable { /** * Deferred initialization because it can be slow. */ + @SuppressWarnings("WeakerAccess") public synchronized Transliterator normalizer() { if (normalizer == null) { - normalizer = Transliterator - .createFromRules("", normalizerRules, Transliterator.FORWARD); + normalizer = TransliteratorManager.get(normalizerRules); } return normalizer; } @@ -115,39 +114,49 @@ public final class Index implements RAFSerializable { * Note that using this comparator probably involves doing too many text * normalizations. */ + @SuppressWarnings("WeakerAccess") public NormalizeComparator getSortComparator() { - return new NormalizeComparator(normalizer(), sortLanguage.getCollator()); + return new NormalizeComparator(normalizer(), sortLanguage.getCollator(), dict.dictFileVersion); } - public Index(final Dictionary dict, final DataInput inp) throws IOException { + public Index(final Dictionary dict, final DataInputBuffer raf) throws IOException { this.dict = dict; - RandomAccessFile raf = (RandomAccessFile)inp; shortName = raf.readUTF(); longName = raf.readUTF(); final String languageCode = raf.readUTF(); sortLanguage = Language.lookup(languageCode); normalizerRules = raf.readUTF(); swapPairEntries = raf.readBoolean(); - if (sortLanguage == null) { - throw new IOException("Unsupported language: " + languageCode); - } if (dict.dictFileVersion >= 2) { mainTokenCount = raf.readInt(); } sortedIndexEntries = CachingList.create( - RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(), dict.dictFileVersion, - dict.dictFileVersion >= 7 ? 16 : 1, dict.dictFileVersion >= 7), CACHE_SIZE); - if (dict.dictFileVersion >= 4) { - stoplist = new SerializableSerializer>().read(raf); + RAFList.create(raf, new IndexEntrySerializer(), + dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE, true); + if (dict.dictFileVersion >= 7) { + int count = StringUtil.readVarInt(raf); + stoplist = new HashSet<>(count); + for (int i = 0; i < count; ++i) { + stoplist.add(raf.readUTF()); + } + } else if (dict.dictFileVersion >= 4) { + stoplist = new HashSet<>(); + raf.readInt(); // length + raf.skipBytes(18); + byte b = raf.readByte(); + raf.skipBytes(b == 'L' ? 71 : 33); + while ((b = raf.readByte()) == 0x74) { + stoplist.add(raf.readUTF()); + } + if (b != 0x78) throw new IOException("Invalid data in dictionary stoplist!"); } else { stoplist = Collections.emptySet(); } rows = CachingList.create( - UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), - CACHE_SIZE); + UniformRAFList.create(raf, new RowBase.Serializer(this)), + CACHE_SIZE, true); } - @Override public void write(final DataOutput out) throws IOException { RandomAccessFile raf = (RandomAccessFile)out; raf.writeUTF(shortName); @@ -155,16 +164,15 @@ public final class Index implements RAFSerializable { raf.writeUTF(sortLanguage.getIsoCode()); raf.writeUTF(normalizerRules); raf.writeBoolean(swapPairEntries); - if (dict.dictFileVersion >= 2) { - raf.writeInt(mainTokenCount); + raf.writeInt(mainTokenCount); + RAFList.write(raf, sortedIndexEntries, new IndexEntrySerializer(), 32, true); + StringUtil.writeVarInt(raf, stoplist.size()); + for (String i : stoplist) { + raf.writeUTF(i); } - RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 16, true); - new SerializableSerializer>().write(raf, stoplist); - UniformRAFList.write(raf, rows, new RowBase.Serializer(this), 5 /* - * bytes - * per - * entry - */); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(raf.getFD()))); + UniformRAFList.write(outb, rows, new RowBase.Serializer(this), 3 /* bytes per entry */); + outb.flush(); } public void print(final PrintStream out) { @@ -173,7 +181,7 @@ public final class Index implements RAFSerializable { } } - private final RAFSerializer indexEntrySerializer = new RAFSerializer() { + private final class IndexEntrySerializer implements RAFSerializer { @Override public IndexEntry read(DataInput raf) throws IOException { return new IndexEntry(Index.this, raf); @@ -183,31 +191,27 @@ public final class Index implements RAFSerializable { public void write(DataOutput raf, IndexEntry t) throws IOException { t.write(raf); } - }; + } - public static final class IndexEntry implements RAFSerializable { - private final Index index; + public static final class IndexEntry { public final String token; private final String normalizedToken; public final int startRow; - public final int numRows; // doesn't count the token row! + final int numRows; // doesn't count the token row! public List htmlEntries; - private int[] htmlEntryIndices; public IndexEntry(final Index index, final String token, final String normalizedToken, - final int startRow, final int numRows) { - this.index = index; + final int startRow, final int numRows, final List htmlEntries) { assert token.equals(token.trim()); assert token.length() > 0; this.token = token; this.normalizedToken = normalizedToken; this.startRow = startRow; this.numRows = numRows; - this.htmlEntries = new ArrayList(); + this.htmlEntries = htmlEntries; } - public IndexEntry(final Index index, final DataInput raf) throws IOException { - this.index = index; + IndexEntry(final Index index, final DataInput raf) throws IOException { token = raf.readUTF(); if (index.dict.dictFileVersion >= 7) { startRow = StringUtil.readVarInt(raf); @@ -218,27 +222,32 @@ public final class Index implements RAFSerializable { } final boolean hasNormalizedForm = raf.readBoolean(); normalizedToken = hasNormalizedForm ? raf.readUTF() : token; - htmlEntryIndices = null; if (index.dict.dictFileVersion >= 7) { int size = StringUtil.readVarInt(raf); - htmlEntryIndices = new int[size]; - for (int i = 0; i < size; ++i) { - htmlEntryIndices[i] = StringUtil.readVarInt(raf); - } - this.htmlEntries = CachingList.create(new AbstractList() { - @Override - public HtmlEntry get(int i) { - return index.dict.htmlEntries.get(htmlEntryIndices[i]); - } - @Override - public int size() { - return htmlEntryIndices.length; + if (size == 0) { + this.htmlEntries = Collections.emptyList(); + } else { + final int[] htmlEntryIndices = new int[size]; + for (int i = 0; i < size; ++i) { + htmlEntryIndices[i] = StringUtil.readVarInt(raf); } - }, 1); + this.htmlEntries = new AbstractList() { + @Override + public HtmlEntry get(int i) { + return index.dict.htmlEntries.get(htmlEntryIndices[i]); + } + + @Override + public int size() { + return htmlEntryIndices.length; + } + }; + } } else if (index.dict.dictFileVersion >= 6) { this.htmlEntries = CachingList.create( - RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer, - ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion), 1); + RAFList.create((DataInputBuffer)raf, index.dict.htmlEntryIndexSerializer, + index.dict.dictFileVersion, + index.dict.dictInfo + " htmlEntries: "), 1, false); } else { this.htmlEntries = Collections.emptyList(); } @@ -262,12 +271,12 @@ public final class Index implements RAFSerializable { return String.format("%s@%d(%d)", token, startRow, numRows); } - public String normalizedToken() { + String normalizedToken() { return normalizedToken; } } - static final TransformingList.Transformer INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer() { + private static final TransformingList.Transformer INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer() { @Override public String transform(IndexEntry t1) { return t1.token; @@ -276,8 +285,8 @@ public final class Index implements RAFSerializable { public IndexEntry findExact(final String exactToken) { final int result = Collections.binarySearch( - TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken, - getSortComparator()); + TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken, + getSortComparator()); if (result >= 0) { return sortedIndexEntries.get(result); } @@ -289,13 +298,33 @@ public final class Index implements RAFSerializable { return index != -1 ? sortedIndexEntries.get(index) : null; } - public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) { + private int compareIdx(String token, final Comparator sortCollator, int idx) { + final IndexEntry entry = sortedIndexEntries.get(idx); + return NormalizeComparator.compareWithoutDash(token, entry.normalizedToken(), sortCollator, dict.dictFileVersion); + } + + private int findMatchLen(final Comparator sortCollator, String a, String b) { + int start = 0; + int end = Math.min(a.length(), b.length()); + while (start < end) + { + int mid = (start + end + 1) / 2; + if (sortCollator.compare(a.substring(0, mid), b.substring(0, mid)) == 0) + start = mid; + else + end = mid - 1; + } + return start; + } + + private int findInsertionPointIndex(String token, final AtomicBoolean interrupted) { + String orig_token = token; token = normalizeToken(token); int start = 0; int end = sortedIndexEntries.size(); - final Collator sortCollator = sortLanguage.getCollator(); + final Comparator sortCollator = sortLanguage.getCollator(); while (start < end) { final int mid = (start + end) / 2; if (interrupted.get()) { @@ -303,18 +332,62 @@ public final class Index implements RAFSerializable { } final IndexEntry midEntry = sortedIndexEntries.get(mid); - final int comp = sortCollator.compare(token, midEntry.normalizedToken()); + int comp = NormalizeComparator.compareWithoutDash(token, midEntry.normalizedToken(), sortCollator, dict.dictFileVersion); + if (comp == 0) + comp = sortCollator.compare(token, midEntry.normalizedToken()); if (comp == 0) { - final int result = windBackCase(token, mid, interrupted); - return result; + start = end = mid; + break; } else if (comp < 0) { // System.out.println("Upper bound: " + midEntry + ", norm=" + // midEntry.normalizedToken() + ", mid=" + mid); - end = mid; + + // Hack for robustness if sort order is broken + if (mid + 2 < end && + compareIdx(token, sortCollator, mid + 1) > 0 && + compareIdx(token, sortCollator, mid + 2) > 0) { + start = mid; + } else { + end = mid; + } } else { // System.out.println("Lower bound: " + midEntry + ", norm=" + // midEntry.normalizedToken() + ", mid=" + mid); - start = mid + 1; + + // Hack for robustness if sort order is broken + if (mid - 2 >= start && + compareIdx(token, sortCollator, mid - 1) < 0 && + compareIdx(token, sortCollator, mid - 2) < 0) { + end = mid + 1; + } else { + start = mid + 1; + } + } + } + + // if the word before is the better match, move + // our result to it + if (start > 0 && start < sortedIndexEntries.size()) { + String prev = sortedIndexEntries.get(start - 1).normalizedToken(); + String next = sortedIndexEntries.get(start).normalizedToken(); + if (findMatchLen(sortCollator, token, prev) >= findMatchLen(sortCollator, token, next)) + start--; + } + + // If the search term was normalized, try to find an exact match first + if (!orig_token.equalsIgnoreCase(token)) { + int matchLen = findMatchLen(sortCollator, token, sortedIndexEntries.get(start).normalizedToken()); + int scan = start; + while (scan >= 0 && scan < sortedIndexEntries.size()) { + IndexEntry e = sortedIndexEntries.get(scan); + if (e.token.equalsIgnoreCase(orig_token)) + { + return scan; + } + if (matchLen > findMatchLen(sortCollator, token, e.normalizedToken())) + break; + if (interrupted.get()) return start; + scan++; } } @@ -325,7 +398,7 @@ public final class Index implements RAFSerializable { return result; } - private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) { + private int windBackCase(final String token, int result, final AtomicBoolean interrupted) { while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) { --result; if (interrupted.get()) { @@ -341,10 +414,10 @@ public final class Index implements RAFSerializable { private static final int MAX_SEARCH_ROWS = 1000; - private final Map prefixToNumRows = new LinkedHashMap(); + private final Map prefixToNumRows = new HashMap<>(); - private synchronized final int getUpperBoundOnRowsStartingWith(final String normalizedPrefix, - final int maxRows, final AtomicBoolean interrupted) { + private synchronized int getUpperBoundOnRowsStartingWith(final String normalizedPrefix, + final int maxRows, final AtomicBoolean interrupted) { final Integer numRows = prefixToNumRows.get(normalizedPrefix); if (numRows != null) { return numRows; @@ -357,7 +430,8 @@ public final class Index implements RAFSerializable { return -1; } final IndexEntry indexEntry = sortedIndexEntries.get(index); - if (!indexEntry.normalizedToken.startsWith(normalizedPrefix)) { + if (!indexEntry.normalizedToken.startsWith(normalizedPrefix) && + !NormalizeComparator.withoutDash(indexEntry.normalizedToken).startsWith(normalizedPrefix)) { break; } rowCount += indexEntry.numRows + indexEntry.htmlEntries.size(); @@ -366,17 +440,17 @@ public final class Index implements RAFSerializable { break; } } - prefixToNumRows.put(normalizedPrefix, numRows); + prefixToNumRows.put(normalizedPrefix, rowCount); return rowCount; } public final List multiWordSearch( - final String searchText, final List searchTokens, - final AtomicBoolean interrupted) { + final String searchText, final List searchTokens, + final AtomicBoolean interrupted) { final long startMills = System.currentTimeMillis(); - final List result = new ArrayList(); + final List result = new ArrayList<>(); - final Set normalizedNonStoplist = new LinkedHashSet(); + final Set normalizedNonStoplist = new HashSet<>(); String bestPrefix = null; int leastRows = Integer.MAX_VALUE; @@ -393,7 +467,7 @@ public final class Index implements RAFSerializable { if (!stoplist.contains(searchToken)) { if (normalizedNonStoplist.add(normalized)) { final int numRows = getUpperBoundOnRowsStartingWith(normalized, - MAX_SEARCH_ROWS, interrupted); + MAX_SEARCH_ROWS, interrupted); if (numRows != -1 && numRows < leastRows) { if (numRows == 0) { // We really are done here. @@ -417,10 +491,10 @@ public final class Index implements RAFSerializable { System.out.println("Everything was in the stoplist!"); } System.out.println("Searching using prefix: " + bestPrefix + ", leastRows=" + leastRows - + ", searchTokens=" + searchTokens); + + ", searchTokens=" + searchTokens); // Place to store the things that match. - final Map> matches = new EnumMap>( + final Map> matches = new EnumMap<>( RowMatchType.class); for (final RowMatchType rowMatchType : RowMatchType.values()) { if (rowMatchType != RowMatchType.NO_MATCH) { @@ -440,14 +514,15 @@ public final class Index implements RAFSerializable { final String searchToken = bestPrefix; final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted); - final Set rowsAlreadySeen = new HashSet(); + final Set rowsAlreadySeen = new HashSet<>(); for (int index = insertionPointIndex; index < sortedIndexEntries.size() && matchCount < MAX_SEARCH_ROWS; ++index) { if (interrupted.get()) { return null; } final IndexEntry indexEntry = sortedIndexEntries.get(index); - if (!indexEntry.normalizedToken.startsWith(searchToken)) { + if (!indexEntry.normalizedToken.startsWith(searchToken) && + !NormalizeComparator.withoutDash(indexEntry.normalizedToken).startsWith(searchToken)) { break; } @@ -467,7 +542,7 @@ public final class Index implements RAFSerializable { } rowsAlreadySeen.add(rowKey); final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(), - swapPairEntries); + swapPairEntries); if (matchType != RowMatchType.NO_MATCH) { matches.get(matchType).add(row); ++matchCount; @@ -478,9 +553,9 @@ public final class Index implements RAFSerializable { // Sort them into a reasonable order. final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator( - swapPairEntries); + swapPairEntries); for (final Collection rows : matches.values()) { - final List ordered = new ArrayList(rows); + final List ordered = new ArrayList<>(rows); Collections.sort(ordered, lengthComparator); result.addAll(ordered); } @@ -490,12 +565,12 @@ public final class Index implements RAFSerializable { } private String normalizeToken(final String searchToken) { - if (TransliteratorManager.init(null)) { + if (TransliteratorManager.init(null, null)) { final Transliterator normalizer = normalizer(); return normalizer.transliterate(searchToken); } else { // Do our best since the Transliterators aren't up yet. - return searchToken.toLowerCase(); + return searchToken.toLowerCase(Locale.US); } }