X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FIndex.java;h=e289f4942909b1433c36591dcc566a60589e7256;hb=4d21ab1dcc458d31c72add1742af3717f02a0063;hp=f4666ccf11bcd9a6a5da590206f5755e5dbe0907;hpb=b51d6d5fd02ee3678dcbd3fcf0678946cc5c86e0;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/engine/Index.java b/src/com/hughes/android/dictionary/engine/Index.java index f4666cc..e289f49 100644 --- a/src/com/hughes/android/dictionary/engine/Index.java +++ b/src/com/hughes/android/dictionary/engine/Index.java @@ -13,7 +13,7 @@ // limitations under the License. /** - * + * */ package com.hughes.android.dictionary.engine; @@ -22,25 +22,29 @@ import com.hughes.android.dictionary.DictionaryInfo; import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; import com.hughes.android.dictionary.engine.RowBase.RowKey; import com.hughes.util.CachingList; +import com.hughes.util.StringUtil; import com.hughes.util.TransformingList; import com.hughes.util.raf.RAFList; import com.hughes.util.raf.RAFSerializable; import com.hughes.util.raf.RAFSerializer; import com.hughes.util.raf.SerializableSerializer; import com.hughes.util.raf.UniformRAFList; -import com.ibm.icu.text.Collator; +import java.text.Collator; import com.ibm.icu.text.Transliterator; +import java.io.DataInput; +import java.io.DataOutput; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; +import java.util.AbstractList; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.EnumMap; import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -81,8 +85,8 @@ public final class Index implements RAFSerializable { // -------------------------------------------------------------------------- public Index(final Dictionary dict, final String shortName, final String longName, - final Language sortLanguage, final String normalizerRules, - final boolean swapPairEntries, final Set stoplist) { + final Language sortLanguage, final String normalizerRules, + final boolean swapPairEntries, final Set stoplist) { this.dict = dict; this.shortName = shortName; this.longName = longName; @@ -101,8 +105,7 @@ public final class Index implements RAFSerializable { */ public synchronized Transliterator normalizer() { if (normalizer == null) { - normalizer = Transliterator - .createFromRules("", normalizerRules, Transliterator.FORWARD); + normalizer = TransliteratorManager.get(normalizerRules); } return normalizer; } @@ -112,11 +115,12 @@ public final class Index implements RAFSerializable { * normalizations. */ public NormalizeComparator getSortComparator() { - return new NormalizeComparator(normalizer(), sortLanguage.getCollator()); + return new NormalizeComparator(normalizer(), sortLanguage.getCollator(), dict.dictFileVersion); } - public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException { + public Index(final Dictionary dict, final DataInput inp) throws IOException { this.dict = dict; + RandomAccessFile raf = (RandomAccessFile)inp; shortName = raf.readUTF(); longName = raf.readUTF(); final String languageCode = raf.readUTF(); @@ -130,19 +134,27 @@ public final class Index implements RAFSerializable { mainTokenCount = raf.readInt(); } sortedIndexEntries = CachingList.create( - RAFList.create(raf, indexEntrySerializer, raf.getFilePointer()), CACHE_SIZE); - if (dict.dictFileVersion >= 4) { + RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(), + dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE); + if (dict.dictFileVersion >= 7) { + int count = StringUtil.readVarInt(raf); + stoplist = new HashSet(count); + for (int i = 0; i < count; ++i) { + stoplist.add(raf.readUTF()); + } + } else if (dict.dictFileVersion >= 4) { stoplist = new SerializableSerializer>().read(raf); } else { stoplist = Collections.emptySet(); } rows = CachingList.create( - UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), - CACHE_SIZE); + UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), + CACHE_SIZE); } @Override - public void write(final RandomAccessFile raf) throws IOException { + public void write(final DataOutput out) throws IOException { + RandomAccessFile raf = (RandomAccessFile)out; raf.writeUTF(shortName); raf.writeUTF(longName); raf.writeUTF(sortLanguage.getIsoCode()); @@ -151,13 +163,12 @@ public final class Index implements RAFSerializable { if (dict.dictFileVersion >= 2) { raf.writeInt(mainTokenCount); } - RAFList.write(raf, sortedIndexEntries, indexEntrySerializer); - new SerializableSerializer>().write(raf, stoplist); - UniformRAFList.write(raf, rows, new RowBase.Serializer(this), 5 /* - * bytes - * per - * entry - */); + RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 32, true); + StringUtil.writeVarInt(raf, stoplist.size()); + for (String i : stoplist) { + raf.writeUTF(i); + } + UniformRAFList.write(raf, rows, new RowBase.Serializer(this), 3 /* bytes per entry */); } public void print(final PrintStream out) { @@ -168,27 +179,25 @@ public final class Index implements RAFSerializable { private final RAFSerializer indexEntrySerializer = new RAFSerializer() { @Override - public IndexEntry read(RandomAccessFile raf) throws IOException { + public IndexEntry read(DataInput raf) throws IOException { return new IndexEntry(Index.this, raf); } @Override - public void write(RandomAccessFile raf, IndexEntry t) throws IOException { + public void write(DataOutput raf, IndexEntry t) throws IOException { t.write(raf); } }; public static final class IndexEntry implements RAFSerializable { - private final Index index; public final String token; private final String normalizedToken; public final int startRow; public final int numRows; // doesn't count the token row! - public final List htmlEntries; + public List htmlEntries; public IndexEntry(final Index index, final String token, final String normalizedToken, - final int startRow, final int numRows) { - this.index = index; + final int startRow, final int numRows) { assert token.equals(token.trim()); assert token.length() > 0; this.token = token; @@ -198,32 +207,59 @@ public final class Index implements RAFSerializable { this.htmlEntries = new ArrayList(); } - public IndexEntry(final Index index, final RandomAccessFile raf) throws IOException { - this.index = index; + public IndexEntry(final Index index, final DataInput raf) throws IOException { token = raf.readUTF(); - startRow = raf.readInt(); - numRows = raf.readInt(); + if (index.dict.dictFileVersion >= 7) { + startRow = StringUtil.readVarInt(raf); + numRows = StringUtil.readVarInt(raf); + } else { + startRow = raf.readInt(); + numRows = raf.readInt(); + } final boolean hasNormalizedForm = raf.readBoolean(); normalizedToken = hasNormalizedForm ? raf.readUTF() : token; - if (index.dict.dictFileVersion >= 6) { + if (index.dict.dictFileVersion >= 7) { + int size = StringUtil.readVarInt(raf); + if (size == 0) { + this.htmlEntries = Collections.emptyList(); + } else { + final int[] htmlEntryIndices = new int[size]; + for (int i = 0; i < size; ++i) { + htmlEntryIndices[i] = StringUtil.readVarInt(raf); + } + this.htmlEntries = new AbstractList() { + @Override + public HtmlEntry get(int i) { + return index.dict.htmlEntries.get(htmlEntryIndices[i]); + } + @Override + public int size() { + return htmlEntryIndices.length; + } + }; + } + } else if (index.dict.dictFileVersion >= 6) { this.htmlEntries = CachingList.create( - RAFList.create(raf, index.dict.htmlEntryIndexSerializer, - raf.getFilePointer()), 1); + RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer, + ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion, + index.dict.dictInfo + " htmlEntries: "), 1); } else { this.htmlEntries = Collections.emptyList(); } } - public void write(RandomAccessFile raf) throws IOException { + public void write(DataOutput raf) throws IOException { raf.writeUTF(token); - raf.writeInt(startRow); - raf.writeInt(numRows); + StringUtil.writeVarInt(raf, startRow); + StringUtil.writeVarInt(raf, numRows); final boolean hasNormalizedForm = !token.equals(normalizedToken); raf.writeBoolean(hasNormalizedForm); if (hasNormalizedForm) { raf.writeUTF(normalizedToken); } - RAFList.write(raf, htmlEntries, index.dict.htmlEntryIndexSerializer); + StringUtil.writeVarInt(raf, htmlEntries.size()); + for (HtmlEntry e : htmlEntries) + StringUtil.writeVarInt(raf, e.index()); } public String toString() { @@ -244,8 +280,8 @@ public final class Index implements RAFSerializable { public IndexEntry findExact(final String exactToken) { final int result = Collections.binarySearch( - TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken, - getSortComparator()); + TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken, + getSortComparator()); if (result >= 0) { return sortedIndexEntries.get(result); } @@ -257,13 +293,18 @@ public final class Index implements RAFSerializable { return index != -1 ? sortedIndexEntries.get(index) : null; } + private int compareIdx(String token, final Comparator sortCollator, int idx) { + final IndexEntry entry = sortedIndexEntries.get(idx); + return NormalizeComparator.compareWithoutDash(token, entry.normalizedToken(), sortCollator, dict.dictFileVersion); + } + public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) { token = normalizeToken(token); int start = 0; int end = sortedIndexEntries.size(); - final Collator sortCollator = sortLanguage.getCollator(); + final Comparator sortCollator = sortLanguage.getCollator(); while (start < end) { final int mid = (start + end) / 2; if (interrupted.get()) { @@ -271,18 +312,36 @@ public final class Index implements RAFSerializable { } final IndexEntry midEntry = sortedIndexEntries.get(mid); - final int comp = sortCollator.compare(token, midEntry.normalizedToken()); + int comp = NormalizeComparator.compareWithoutDash(token, midEntry.normalizedToken(), sortCollator, dict.dictFileVersion); + if (comp == 0) + comp = sortCollator.compare(token, midEntry.normalizedToken()); if (comp == 0) { final int result = windBackCase(token, mid, interrupted); return result; } else if (comp < 0) { // System.out.println("Upper bound: " + midEntry + ", norm=" + // midEntry.normalizedToken() + ", mid=" + mid); - end = mid; + + // Hack for robustness if sort order is broken + if (mid + 2 < end && + compareIdx(token, sortCollator, mid + 1) > 0 && + compareIdx(token, sortCollator, mid + 2) > 0) { + start = mid; + } else { + end = mid; + } } else { // System.out.println("Lower bound: " + midEntry + ", norm=" + // midEntry.normalizedToken() + ", mid=" + mid); - start = mid + 1; + + // Hack for robustness if sort order is broken + if (mid - 2 >= start && + compareIdx(token, sortCollator, mid - 1) < 0 && + compareIdx(token, sortCollator, mid - 2) < 0) { + end = mid + 1; + } else { + start = mid + 1; + } } } @@ -309,7 +368,7 @@ public final class Index implements RAFSerializable { private static final int MAX_SEARCH_ROWS = 1000; - private final Map prefixToNumRows = new LinkedHashMap(); + private final Map prefixToNumRows = new HashMap(); private synchronized final int getUpperBoundOnRowsStartingWith(final String normalizedPrefix, final int maxRows, final AtomicBoolean interrupted) { @@ -339,12 +398,12 @@ public final class Index implements RAFSerializable { } public final List multiWordSearch( - final String searchText, final List searchTokens, - final AtomicBoolean interrupted) { + final String searchText, final List searchTokens, + final AtomicBoolean interrupted) { final long startMills = System.currentTimeMillis(); final List result = new ArrayList(); - final Set normalizedNonStoplist = new LinkedHashSet(); + final Set normalizedNonStoplist = new HashSet(); String bestPrefix = null; int leastRows = Integer.MAX_VALUE; @@ -361,7 +420,7 @@ public final class Index implements RAFSerializable { if (!stoplist.contains(searchToken)) { if (normalizedNonStoplist.add(normalized)) { final int numRows = getUpperBoundOnRowsStartingWith(normalized, - MAX_SEARCH_ROWS, interrupted); + MAX_SEARCH_ROWS, interrupted); if (numRows != -1 && numRows < leastRows) { if (numRows == 0) { // We really are done here. @@ -385,11 +444,11 @@ public final class Index implements RAFSerializable { System.out.println("Everything was in the stoplist!"); } System.out.println("Searching using prefix: " + bestPrefix + ", leastRows=" + leastRows - + ", searchTokens=" + searchTokens); + + ", searchTokens=" + searchTokens); // Place to store the things that match. final Map> matches = new EnumMap>( - RowMatchType.class); + RowMatchType.class); for (final RowMatchType rowMatchType : RowMatchType.values()) { if (rowMatchType != RowMatchType.NO_MATCH) { matches.put(rowMatchType, new ArrayList()); @@ -435,7 +494,7 @@ public final class Index implements RAFSerializable { } rowsAlreadySeen.add(rowKey); final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(), - swapPairEntries); + swapPairEntries); if (matchType != RowMatchType.NO_MATCH) { matches.get(matchType).add(row); ++matchCount; @@ -446,7 +505,7 @@ public final class Index implements RAFSerializable { // Sort them into a reasonable order. final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator( - swapPairEntries); + swapPairEntries); for (final Collection rows : matches.values()) { final List ordered = new ArrayList(rows); Collections.sort(ordered, lengthComparator);