+ }
+
+ public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
+ final int index = findInsertionPointIndex(token, interrupted);
+ return index != -1 ? sortedIndexEntries.get(index) : null;
+ }
+
+ private int compareIdx(String token, final Comparator<Object> sortCollator, int idx) {
+ final IndexEntry entry = sortedIndexEntries.get(idx);
+ return NormalizeComparator.compareWithoutDash(token, entry.normalizedToken(), sortCollator, dict.dictFileVersion);
+ }
+
+ private int findMatchLen(final Comparator<Object> sortCollator, String a, String b) {
+ int start = 0;
+ int end = Math.min(a.length(), b.length());
+ while (start < end)
+ {
+ int mid = (start + end + 1) / 2;
+ if (sortCollator.compare(a.substring(0, mid), b.substring(0, mid)) == 0)
+ start = mid;
+ else
+ end = mid - 1;
+ }
+ return start;
+ }
+
+ private int findInsertionPointIndex(String token, final AtomicBoolean interrupted) {
+ token = normalizeToken(token);
+
+ int start = 0;
+ int end = sortedIndexEntries.size();
+
+ final Comparator<Object> sortCollator = sortLanguage.getCollator();
+ while (start < end) {
+ final int mid = (start + end) / 2;
+ if (interrupted.get()) {
+ return -1;
+ }
+ final IndexEntry midEntry = sortedIndexEntries.get(mid);
+
+ int comp = NormalizeComparator.compareWithoutDash(token, midEntry.normalizedToken(), sortCollator, dict.dictFileVersion);
+ if (comp == 0)
+ comp = sortCollator.compare(token, midEntry.normalizedToken());
+ if (comp == 0) {
+ return windBackCase(token, mid, interrupted);
+ } else if (comp < 0) {
+ // System.out.println("Upper bound: " + midEntry + ", norm=" +
+ // midEntry.normalizedToken() + ", mid=" + mid);
+
+ // Hack for robustness if sort order is broken
+ if (mid + 2 < end &&
+ compareIdx(token, sortCollator, mid + 1) > 0 &&
+ compareIdx(token, sortCollator, mid + 2) > 0) {
+ start = mid;
+ } else {
+ end = mid;
+ }
+ } else {
+ // System.out.println("Lower bound: " + midEntry + ", norm=" +
+ // midEntry.normalizedToken() + ", mid=" + mid);
+
+ // Hack for robustness if sort order is broken
+ if (mid - 2 >= start &&
+ compareIdx(token, sortCollator, mid - 1) < 0 &&
+ compareIdx(token, sortCollator, mid - 2) < 0) {
+ end = mid + 1;
+ } else {
+ start = mid + 1;
+ }
+ }
+ }
+
+ // if the word before is the better match, move
+ // our result to it
+ if (start > 0 && start < sortedIndexEntries.size()) {
+ String prev = sortedIndexEntries.get(start - 1).normalizedToken();
+ String next = sortedIndexEntries.get(start).normalizedToken();
+ if (findMatchLen(sortCollator, token, prev) >= findMatchLen(sortCollator, token, next))
+ start--;
+ }
+
+ // If we search for a substring of a string that's in there, return
+ // that.
+ int result = Math.min(start, sortedIndexEntries.size() - 1);
+ result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
+ return result;
+ }
+
+ private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
+ while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
+ --result;
+ if (interrupted.get()) {
+ return result;
+ }
+ }
+ return result;
+ }
+
+ public IndexInfo getIndexInfo() {
+ return new DictionaryInfo.IndexInfo(shortName, sortedIndexEntries.size(), mainTokenCount);
+ }
+
+ private static final int MAX_SEARCH_ROWS = 1000;
+
+ private final Map<String, Integer> prefixToNumRows = new HashMap<>();
+
+ private synchronized final int getUpperBoundOnRowsStartingWith(final String normalizedPrefix,
+ final int maxRows, final AtomicBoolean interrupted) {
+ final Integer numRows = prefixToNumRows.get(normalizedPrefix);
+ if (numRows != null) {
+ return numRows;
+ }
+ final int insertionPointIndex = findInsertionPointIndex(normalizedPrefix, interrupted);
+
+ int rowCount = 0;
+ for (int index = insertionPointIndex; index < sortedIndexEntries.size(); ++index) {
+ if (interrupted.get()) {
+ return -1;
+ }
+ final IndexEntry indexEntry = sortedIndexEntries.get(index);
+ if (!indexEntry.normalizedToken.startsWith(normalizedPrefix) &&
+ !NormalizeComparator.withoutDash(indexEntry.normalizedToken).startsWith(normalizedPrefix)) {
+ break;
+ }
+ rowCount += indexEntry.numRows + indexEntry.htmlEntries.size();
+ if (rowCount > maxRows) {
+ System.out.println("Giving up, too many words with prefix: " + normalizedPrefix);
+ break;
+ }
+ }
+ prefixToNumRows.put(normalizedPrefix, rowCount);
+ return rowCount;
+ }
+
+ public final List<RowBase> multiWordSearch(
+ final String searchText, final List<String> searchTokens,
+ final AtomicBoolean interrupted) {
+ final long startMills = System.currentTimeMillis();
+ final List<RowBase> result = new ArrayList<>();
+
+ final Set<String> normalizedNonStoplist = new HashSet<>();
+
+ String bestPrefix = null;
+ int leastRows = Integer.MAX_VALUE;
+ final StringBuilder searchTokensRegex = new StringBuilder();
+ for (int i = 0; i < searchTokens.size(); ++i) {
+ if (interrupted.get()) {
+ return null;
+ }
+ final String searchToken = searchTokens.get(i);
+ final String normalized = normalizeToken(searchTokens.get(i));
+ // Normalize them all.
+ searchTokens.set(i, normalized);
+
+ if (!stoplist.contains(searchToken)) {
+ if (normalizedNonStoplist.add(normalized)) {
+ final int numRows = getUpperBoundOnRowsStartingWith(normalized,
+ MAX_SEARCH_ROWS, interrupted);
+ if (numRows != -1 && numRows < leastRows) {
+ if (numRows == 0) {
+ // We really are done here.
+ return Collections.emptyList();
+ }
+ leastRows = numRows;
+ bestPrefix = normalized;
+ }
+ }
+ }
+
+ if (searchTokensRegex.length() > 0) {
+ searchTokensRegex.append("[\\s]*");
+ }
+ searchTokensRegex.append(Pattern.quote(normalized));
+ }
+ final Pattern pattern = Pattern.compile(searchTokensRegex.toString());
+
+ if (bestPrefix == null) {
+ bestPrefix = searchTokens.get(0);
+ System.out.println("Everything was in the stoplist!");
+ }
+ System.out.println("Searching using prefix: " + bestPrefix + ", leastRows=" + leastRows
+ + ", searchTokens=" + searchTokens);
+
+ // Place to store the things that match.
+ final Map<RowMatchType, List<RowBase>> matches = new EnumMap<>(
+ RowMatchType.class);
+ for (final RowMatchType rowMatchType : RowMatchType.values()) {
+ if (rowMatchType != RowMatchType.NO_MATCH) {
+ matches.put(rowMatchType, new ArrayList<RowBase>());
+ }
+ }
+
+ int matchCount = 0;
+
+ final int exactMatchIndex = findInsertionPointIndex(searchText, interrupted);
+ if (exactMatchIndex != -1) {
+ final IndexEntry exactMatch = sortedIndexEntries.get(exactMatchIndex);
+ if (pattern.matcher(exactMatch.token).find()) {
+ matches.get(RowMatchType.TITLE_MATCH).add(rows.get(exactMatch.startRow));
+ }
+ }
+
+ final String searchToken = bestPrefix;
+ final int insertionPointIndex = findInsertionPointIndex(searchToken, interrupted);
+ final Set<RowKey> rowsAlreadySeen = new HashSet<>();
+ for (int index = insertionPointIndex; index < sortedIndexEntries.size()
+ && matchCount < MAX_SEARCH_ROWS; ++index) {
+ if (interrupted.get()) {
+ return null;
+ }
+ final IndexEntry indexEntry = sortedIndexEntries.get(index);
+ if (!indexEntry.normalizedToken.startsWith(searchToken) &&
+ !NormalizeComparator.withoutDash(indexEntry.normalizedToken).startsWith(searchToken)) {
+ break;
+ }
+
+ // System.out.println("Searching indexEntry: " + indexEntry.token);
+
+ // Extra +1 to skip token row.
+ for (int rowIndex = indexEntry.startRow + 1; rowIndex < indexEntry.startRow + 1
+ + indexEntry.numRows
+ && rowIndex < rows.size(); ++rowIndex) {
+ if (interrupted.get()) {
+ return null;
+ }
+ final RowBase row = rows.get(rowIndex);
+ final RowBase.RowKey rowKey = row.getRowKey();
+ if (rowsAlreadySeen.contains(rowKey)) {
+ continue;
+ }
+ rowsAlreadySeen.add(rowKey);
+ final RowMatchType matchType = row.matches(searchTokens, pattern, normalizer(),
+ swapPairEntries);
+ if (matchType != RowMatchType.NO_MATCH) {
+ matches.get(matchType).add(row);
+ ++matchCount;
+ }
+ }
+ }
+ // } // searchTokens
+
+ // Sort them into a reasonable order.
+ final RowBase.LengthComparator lengthComparator = new RowBase.LengthComparator(
+ swapPairEntries);
+ for (final Collection<RowBase> rows : matches.values()) {
+ final List<RowBase> ordered = new ArrayList<>(rows);
+ Collections.sort(ordered, lengthComparator);
+ result.addAll(ordered);
+ }
+
+ System.out.println("searchDuration: " + (System.currentTimeMillis() - startMills));