- public String toString() {
- return String.format("inerstionPoint=%s,longestPrefix=%s,longestPrefixString=%s,success=%b", insertionPoint.toString(), longestPrefix.toString(), longestPrefixString, success);
- }
- }
-
-// public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
-// token = normalizer.transliterate(token);
-// if (token.length() == 0) {
-// return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
-// }
-// IndexEntry insertionPoint = null;
-// IndexEntry result = null;
-// boolean unmodified = true;
-// while (!interrupted.get() && token.length() > 0) {
-// result = findInsertionPoint(token, interrupted);
-// if (result == null) {
-// return null;
-// }
-// if (unmodified) {
-// insertionPoint = result;
-// }
-// if (result.normalizedToken(normalizer).startsWith(token)) {
-// return new SearchResult(insertionPoint, result, token, unmodified);
-// }
-// unmodified = false;
-// token = token.substring(0, token.length() - 1);
-// }
-// return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
-// }
-
- private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
- while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken(normalizer).equals(token)) {
- --result;
- if (interrupted.get()) {
+ /**
+  * Serializes this index to {@code out}. The fields are written in this order: short name,
+  * long name, ISO code of the sort language, normalizer rules, the swap-pair flag, the main
+  * token count (only when the dictionary file version is 2 or later), the sorted index
+  * entries, the stoplist (size followed by each word) and finally the rows.
+  *
+  * @param out destination; it is cast to {@link RandomAccessFile}, so passing any other
+  *            {@code DataOutput} implementation fails with a {@code ClassCastException}
+  * @throws IOException if one of the underlying writes fails
+  */
+ public void write(final DataOutput out) throws IOException {
+     final RandomAccessFile file = (RandomAccessFile) out;
+
+     // Header fields.
+     file.writeUTF(shortName);
+     file.writeUTF(longName);
+     file.writeUTF(sortLanguage.getIsoCode());
+     file.writeUTF(normalizerRules);
+     file.writeBoolean(swapPairEntries);
+     final boolean storesMainTokenCount = dict.dictFileVersion >= 2;
+     if (storesMainTokenCount) {
+         file.writeInt(mainTokenCount);
+     }
+
+     // Index entries. The serializer gets no channel because only its write side is used here.
+     final IndexEntrySerializer entrySerializer = new IndexEntrySerializer(null);
+     RAFList.write(file, sortedIndexEntries, entrySerializer, 32, true);
+
+     // Stoplist: element count first, then every word.
+     StringUtil.writeVarInt(file, stoplist.size());
+     for (final String stopword : stoplist) {
+         file.writeUTF(stopword);
+     }
+
+     // Rows.
+     UniformRAFList.write(file, rows, new RowBase.Serializer(this), 3 /* bytes per entry */);
+ }
+
+ /**
+  * Prints every row of this index to {@code out}, in the iteration order of {@code rows}.
+  *
+  * @param out stream each row prints itself to
+  */
+ public void print(final PrintStream out) {
+     final java.util.Iterator<? extends RowBase> remaining = rows.iterator();
+     while (remaining.hasNext()) {
+         remaining.next().print(out);
+     }
+ }
+
+ /**
+  * Serializer for the {@link IndexEntry} values of the enclosing index. Reading builds an
+  * entry from the input; writing delegates to the entry itself.
+  */
+ private final class IndexEntrySerializer implements RAFSerializer<IndexEntry> {
+     /** Channel handed to every entry created by {@code read}; not used by {@code write}. */
+     private final FileChannel channel;
+
+     public IndexEntrySerializer(FileChannel ch) {
+         channel = ch;
+     }
+
+     @Override
+     public IndexEntry read(DataInput raf) throws IOException {
+         final IndexEntry entry = new IndexEntry(Index.this, channel, raf);
+         return entry;
+     }
+
+     @Override
+     public void write(DataOutput raf, IndexEntry t) throws IOException {
+         t.write(raf);
+     }
+ }
+
+ public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
+ public final String token;
+ private final String normalizedToken;
+ public final int startRow;
+ public final int numRows; // doesn't count the token row!
+ public List<HtmlEntry> htmlEntries;
+
+ public IndexEntry(final Index index, final String token, final String normalizedToken,
+ final int startRow, final int numRows) {
+ assert token.equals(token.trim());
+ assert token.length() > 0;
+ this.token = token;
+ this.normalizedToken = normalizedToken;
+ this.startRow = startRow;
+ this.numRows = numRows;
+ this.htmlEntries = new ArrayList<HtmlEntry>();
+ }
+
+ public IndexEntry(final Index index, final FileChannel ch, final DataInput raf) throws IOException {
+ token = raf.readUTF();
+ if (index.dict.dictFileVersion >= 7) {
+ startRow = StringUtil.readVarInt(raf);
+ numRows = StringUtil.readVarInt(raf);
+ } else {
+ startRow = raf.readInt();
+ numRows = raf.readInt();
+ }
+ final boolean hasNormalizedForm = raf.readBoolean();
+ normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
+ if (index.dict.dictFileVersion >= 7) {
+ int size = StringUtil.readVarInt(raf);
+ if (size == 0) {
+ this.htmlEntries = Collections.emptyList();
+ } else {
+ final int[] htmlEntryIndices = new int[size];
+ for (int i = 0; i < size; ++i) {
+ htmlEntryIndices[i] = StringUtil.readVarInt(raf);
+ }
+ this.htmlEntries = new AbstractList<HtmlEntry>() {
+ @Override
+ public HtmlEntry get(int i) {
+ return index.dict.htmlEntries.get(htmlEntryIndices[i]);
+ }
+ @Override
+ public int size() {
+ return htmlEntryIndices.length;
+ }
+ };
+ }
+ } else if (index.dict.dictFileVersion >= 6) {
+ this.htmlEntries = CachingList.create(
+ RAFList.create(ch, index.dict.htmlEntryIndexSerializer,
+ ch.position(), index.dict.dictFileVersion,
+ index.dict.dictInfo + " htmlEntries: "), 1, false);
+ } else {
+ this.htmlEntries = Collections.emptyList();
+ }
+ }
+
+ public void write(DataOutput raf) throws IOException {
+ raf.writeUTF(token);
+ StringUtil.writeVarInt(raf, startRow);
+ StringUtil.writeVarInt(raf, numRows);
+ final boolean hasNormalizedForm = !token.equals(normalizedToken);
+ raf.writeBoolean(hasNormalizedForm);
+ if (hasNormalizedForm) {
+ raf.writeUTF(normalizedToken);
+ }
+ StringUtil.writeVarInt(raf, htmlEntries.size());
+ for (HtmlEntry e : htmlEntries)
+ StringUtil.writeVarInt(raf, e.index());
+ }
+
+ public String toString() {
+ return String.format("%s@%d(%d)", token, startRow, numRows);
+ }
+
+ public String normalizedToken() {
+ return normalizedToken;
+ }
+ }
+
+ /** Transformer that maps an {@link IndexEntry} to its {@code token} field. */
+ static final TransformingList.Transformer<IndexEntry, String> INDEX_ENTRY_TO_TOKEN =
+         new TransformingList.Transformer<IndexEntry, String>() {
+             @Override
+             public String transform(IndexEntry entry) {
+                 final String rawToken = entry.token;
+                 return rawToken;
+             }
+         };
+
+ /**
+  * Looks up {@code exactToken} by binary search over the tokens of the sorted index
+  * entries, ordered by {@code getSortComparator()}.
+  *
+  * @param exactToken token to look for
+  * @return the entry at the position reported by the search, or {@code null} when the
+  *         search reports no match
+  */
+ public IndexEntry findExact(final String exactToken) {
+     final int position = Collections.binarySearch(
+             TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN),
+             exactToken,
+             getSortComparator());
+     if (position < 0) {
+         return null;
+     }
+     return sortedIndexEntries.get(position);
+ }
+
+ /**
+  * Entry-returning variant of {@code findInsertionPointIndex}.
+  *
+  * @param token       token to search for
+  * @param interrupted flag passed through to the index search
+  * @return the entry at the index found, or {@code null} when the index search returns -1
+  */
+ public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
+     final int position = findInsertionPointIndex(token, interrupted);
+     if (position == -1) {
+         return null;
+     }
+     return sortedIndexEntries.get(position);
+ }
+
+ /**
+  * Compares {@code token} with the normalized token of the entry at {@code idx} using
+  * {@code NormalizeComparator.compareWithoutDash}.
+  *
+  * @return the comparator result, with {@code token} as the left-hand operand
+  */
+ private int compareIdx(String token, final Comparator sortCollator, int idx) {
+     final String candidate = sortedIndexEntries.get(idx).normalizedToken();
+     return NormalizeComparator.compareWithoutDash(
+             token, candidate, sortCollator, dict.dictFileVersion);
+ }
+
+ private int findMatchLen(final Comparator sortCollator, String a, String b) {
+ int start = 0;
+ int end = Math.min(a.length(), b.length());
+ while (start < end)
+ {
+ int mid = (start + end + 1) / 2;
+ if (sortCollator.compare(a.substring(0, mid), b.substring(0, mid)) == 0)
+ start = mid;
+ else
+ end = mid - 1;
+ }
+ return start;
+ }
+
+ /**
+  * Finds the index in {@code sortedIndexEntries} at which the normalized form of
+  * {@code token} is found, or the index of the closest entry when there is no exact match.
+  *
+  * <p>The search is a binary search over the normalized tokens, with two departures from
+  * the textbook version: it tolerates a locally broken sort order (see the comments in the
+  * loop), and when nothing matches exactly it prefers the neighbouring entry that shares
+  * the longer prefix with the token. The result is always wound back to the first of a run
+  * of entries with the same normalized token.
+  *
+  * @param token       token to search for; it is normalized with {@code normalizeToken} first
+  * @param interrupted polled once per search step; when set, the search stops
+  * @return the index found, or -1 if {@code interrupted} was set during the binary search
+  *         or the index has no entries
+  */
+ public int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
+     token = normalizeToken(token);
+
+     int start = 0;
+     final int entryCount = sortedIndexEntries.size();
+     if (entryCount == 0) {
+         // Nothing to point at; without this guard the final get() would be called with -1.
+         return -1;
+     }
+     int end = entryCount;
+
+     final Comparator sortCollator = sortLanguage.getCollator();
+     while (start < end) {
+         final int mid = (start + end) >>> 1;
+         if (interrupted.get()) {
+             return -1;
+         }
+         final IndexEntry midEntry = sortedIndexEntries.get(mid);
+
+         int comp = NormalizeComparator.compareWithoutDash(token, midEntry.normalizedToken(), sortCollator, dict.dictFileVersion);
+         if (comp == 0) {
+             // Equal when compared without dashes: let the plain collator decide.
+             comp = sortCollator.compare(token, midEntry.normalizedToken());
+         }
+         if (comp == 0) {
+             return windBackCase(token, mid, interrupted);
+         }
+
+         if (comp < 0) {
+             // Token sorts before the middle entry.
+             // Hack for robustness if sort order is broken: when the next two entries both
+             // sort before the token, distrust the middle entry and keep the upper part.
+             if (mid + 2 < end &&
+                     compareIdx(token, sortCollator, mid + 1) > 0 &&
+                     compareIdx(token, sortCollator, mid + 2) > 0) {
+                 start = mid;
+             } else {
+                 end = mid;
+             }
+         } else {
+             // Token sorts after the middle entry.
+             // Hack for robustness if sort order is broken: when the previous two entries
+             // both sort after the token, distrust the middle entry and keep the lower part.
+             if (mid - 2 >= start &&
+                     compareIdx(token, sortCollator, mid - 1) < 0 &&
+                     compareIdx(token, sortCollator, mid - 2) < 0) {
+                 end = mid + 1;
+             } else {
+                 start = mid + 1;
+             }
+         }
+     }
+
+     // if the word before is the better match, move
+     // our result to it
+     if (start > 0 && start < entryCount) {
+         final String prev = sortedIndexEntries.get(start - 1).normalizedToken();
+         final String next = sortedIndexEntries.get(start).normalizedToken();
+         if (findMatchLen(sortCollator, token, prev) >= findMatchLen(sortCollator, token, next)) {
+             start--;
+         }
+     }
+
+     // If we search for a substring of a string that's in there, return
+     // that. The index is clamped because start may equal the entry count.
+     final int clamped = Math.min(start, entryCount - 1);
+     return windBackCase(sortedIndexEntries.get(clamped).normalizedToken(), clamped, interrupted);
+ }
+
+ private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
+ while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
+ --result;
+ if (interrupted.get()) {
+ return result;
+ }
+ }