import com.hughes.util.raf.RAFSerializer;
import com.hughes.util.raf.SerializableSerializer;
import com.hughes.util.raf.UniformRAFList;
-import java.text.Collator;
import com.ibm.icu.text.Transliterator;
import java.io.DataInput;
+import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
+import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Collection;
return new NormalizeComparator(normalizer(), sortLanguage.getCollator(), dict.dictFileVersion);
}
- public Index(final Dictionary dict, final DataInput inp) throws IOException {
+ public Index(final Dictionary dict, final FileChannel inp, final DataInput raf) throws IOException {
this.dict = dict;
- RandomAccessFile raf = (RandomAccessFile)inp;
shortName = raf.readUTF();
longName = raf.readUTF();
final String languageCode = raf.readUTF();
mainTokenCount = raf.readInt();
}
sortedIndexEntries = CachingList.create(
- RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(),
- dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE);
+ RAFList.create(inp, new IndexEntrySerializer(dict.dictFileVersion == 6 ? inp : null), inp.position(),
+ dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE, true);
if (dict.dictFileVersion >= 7) {
int count = StringUtil.readVarInt(raf);
stoplist = new HashSet<String>(count);
stoplist = Collections.emptySet();
}
rows = CachingList.create(
- UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()),
- CACHE_SIZE);
+ UniformRAFList.create(inp, new RowBase.Serializer(this), inp.position()),
+ CACHE_SIZE, true);
}
@Override
if (dict.dictFileVersion >= 2) {
raf.writeInt(mainTokenCount);
}
- RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 32, true);
+ RAFList.write(raf, sortedIndexEntries, new IndexEntrySerializer(null), 32, true);
StringUtil.writeVarInt(raf, stoplist.size());
for (String i : stoplist) {
raf.writeUTF(i);
}
}
- private final RAFSerializer<IndexEntry> indexEntrySerializer = new RAFSerializer<IndexEntry>() {
+ /**
+  * Serializer for {@link IndexEntry} records. It replaces the former anonymous
+  * {@code indexEntrySerializer} field so that a {@link FileChannel} can be
+  * handed to each entry as it is read.
+  */
+ private final class IndexEntrySerializer implements RAFSerializer<IndexEntry> {
+ // Channel forwarded to every IndexEntry built by read(); may be null.
+ private final FileChannel ch;
+
+ /**
+  * @param ch channel passed through to the IndexEntry constructor. The call
+  *           sites visible in this change pass the index channel only when
+  *           reading a version-6 file, and null otherwise (including for
+  *           writing).
+  */
+ public IndexEntrySerializer(FileChannel ch) {
+ this.ch = ch;
+ }
+
+ // Builds an IndexEntry from raf, passing along the enclosing Index and the stored channel.
@Override
public IndexEntry read(DataInput raf) throws IOException {
- return new IndexEntry(Index.this, raf);
+ return new IndexEntry(Index.this, ch, raf);
}
+ // Delegates to the entry's own write method; the stored channel is not used here.
@Override
public void write(DataOutput raf, IndexEntry t) throws IOException {
t.write(raf);
}
- };
+ }
public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
public final String token;
this.htmlEntries = new ArrayList<HtmlEntry>();
}
- public IndexEntry(final Index index, final DataInput raf) throws IOException {
+ public IndexEntry(final Index index, final FileChannel ch, final DataInput raf) throws IOException {
token = raf.readUTF();
if (index.dict.dictFileVersion >= 7) {
startRow = StringUtil.readVarInt(raf);
}
} else if (index.dict.dictFileVersion >= 6) {
this.htmlEntries = CachingList.create(
- RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer,
- ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion,
- index.dict.dictInfo + " htmlEntries: "), 1);
+ RAFList.create(ch, index.dict.htmlEntryIndexSerializer,
+ ch.position(), index.dict.dictFileVersion,
+ index.dict.dictInfo + " htmlEntries: "), 1, false);
} else {
this.htmlEntries = Collections.emptyList();
}
return index != -1 ? sortedIndexEntries.get(index) : null;
}
+ /**
+  * Compares {@code token} with the normalized token of the index entry at
+  * position {@code idx} in {@code sortedIndexEntries}.
+  *
+  * @param token the search token, passed unchanged as the first comparison operand
+  * @param sortCollator comparator forwarded to NormalizeComparator.compareWithoutDash
+  * @param idx position of the entry to compare against
+  * @return the result of NormalizeComparator.compareWithoutDash; presumably
+  *         negative when token sorts before the entry, as the callers in
+  *         findInsertionPointIndex treat it - confirm against that helper
+  */
+ private int compareIdx(String token, final Comparator sortCollator, int idx) {
+ final IndexEntry entry = sortedIndexEntries.get(idx);
+ return NormalizeComparator.compareWithoutDash(token, entry.normalizedToken(), sortCollator, dict.dictFileVersion);
+ }
+
+ /**
+  * Finds, by bisection, a prefix length at which {@code a} and {@code b}
+  * compare as equal under {@code sortCollator}.
+  *
+  * NOTE(review): the bisection only yields the longest such prefix if prefix
+  * equality is monotonic in length (equal at length k implies equal at every
+  * shorter length) - confirm this holds for the collator in use.
+  *
+  * @param sortCollator comparator used to test the two prefixes for equality
+  * @param a first string
+  * @param b second string
+  * @return the prefix length found, between 0 and the shorter string's length
+  */
+ private int findMatchLen(final Comparator sortCollator, String a, String b) {
+ int start = 0;
+ int end = Math.min(a.length(), b.length());
+ while (start < end)
+ {
+ // Round the midpoint up so that "start = mid" always makes progress.
+ int mid = (start + end + 1) / 2;
+ if (sortCollator.compare(a.substring(0, mid), b.substring(0, mid)) == 0)
+ start = mid;
+ else
+ end = mid - 1;
+ }
+ return start;
+ }
+
public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
token = normalizeToken(token);
} else if (comp < 0) {
// System.out.println("Upper bound: " + midEntry + ", norm=" +
// midEntry.normalizedToken() + ", mid=" + mid);
- end = mid;
+
+ // Hack for robustness if sort order is broken
+ if (mid + 2 < end &&
+ compareIdx(token, sortCollator, mid + 1) > 0 &&
+ compareIdx(token, sortCollator, mid + 2) > 0) {
+ start = mid;
+ } else {
+ end = mid;
+ }
} else {
// System.out.println("Lower bound: " + midEntry + ", norm=" +
// midEntry.normalizedToken() + ", mid=" + mid);
- start = mid + 1;
+
+ // Hack for robustness if sort order is broken
+ if (mid - 2 >= start &&
+ compareIdx(token, sortCollator, mid - 1) < 0 &&
+ compareIdx(token, sortCollator, mid - 2) < 0) {
+ end = mid + 1;
+ } else {
+ start = mid + 1;
+ }
}
}
+ // If the entry before the insertion point shares at least as long a
+ // prefix with the token as the entry at it (ties included), step back to it.
+ if (start > 0 && start < sortedIndexEntries.size()) {
+ String prev = sortedIndexEntries.get(start - 1).normalizedToken();
+ String next = sortedIndexEntries.get(start).normalizedToken();
+ if (findMatchLen(sortCollator, token, prev) >= findMatchLen(sortCollator, token, next))
+ start--;
+ }
+
// If we search for a substring of a string that's in there, return
// that.
int result = Math.min(start, sortedIndexEntries.size() - 1);
}
private String normalizeToken(final String searchToken) {
- if (TransliteratorManager.init(null)) {
+ if (TransliteratorManager.init(null, null)) {
final Transliterator normalizer = normalizer();
return normalizer.transliterate(searchToken);
} else {