import com.hughes.util.raf.RAFSerializer;
import com.hughes.util.raf.UniformRAFList;
import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.Transliterator;
public final class Index implements RAFSerializable<Index> {
// persisted: tells how the entries are sorted.
public final Language sortLanguage;
+ final String normalizerRules;
+
+ // Built lazily from normalizerRules by normalizer().
+ private Transliterator normalizer;
// persisted
public final List<IndexEntry> sortedIndexEntries;
// --------------------------------------------------------------------------
+ /**
+ * Creates an empty index: stores the given settings and starts with empty
+ * {@code sortedIndexEntries} and {@code rows} lists.
+ *
+ * @param normalizerRules rule text that {@code normalizer()} later passes to
+ *        {@code Transliterator.createFromRules}.
+ */
- public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final boolean swapPairEntries) {
+ public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) {
this.dict = dict;
this.shortName = shortName;
this.longName = longName;
this.sortLanguage = sortLanguage;
+ this.normalizerRules = normalizerRules;
this.swapPairEntries = swapPairEntries;
sortedIndexEntries = new ArrayList<IndexEntry>();
rows = new ArrayList<RowBase>();
+
+ // Left null here; normalizer() builds it on first use.
+ normalizer = null;
+ }
+
+ /**
+ * Returns this index's Transliterator, creating it from
+ * {@code normalizerRules} (forward direction, empty ID) on the first call and
+ * caching it in the {@code normalizer} field for later calls.
+ * The method is synchronized, so the null check and assignment are not
+ * interleaved between threads calling it on the same Index.
+ */
+ public synchronized Transliterator normalizer() {
+ if (normalizer == null) {
+ normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
+ }
+ return normalizer;
}
public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
longName = raf.readUTF();
final String languageCode = raf.readUTF();
sortLanguage = Language.lookup(languageCode);
+ normalizerRules = raf.readUTF();
swapPairEntries = raf.readBoolean();
if (sortLanguage == null) {
throw new IOException("Unsupported language: " + languageCode);
raf.writeUTF(shortName);
raf.writeUTF(longName);
raf.writeUTF(sortLanguage.getSymbol());
+ raf.writeUTF(normalizerRules);
raf.writeBoolean(swapPairEntries);
RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
}
}
- static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
- String token;
- int startRow;
+ public static final class IndexEntry implements RAFSerializable<Index.IndexEntry> {
+ public final String token;
+ private final String normalizedToken;
+ public final int startRow;
+ public final int numRows;
+
static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
@Override
t.write(raf);
}};
+ /**
+ * Creates an entry from already-computed values.
+ * With assertions enabled, checks that {@code token} is non-empty and has no
+ * leading or trailing whitespace; {@code normalizedToken} is stored unchecked.
+ * NOTE(review): normalizedToken is presumably the index normalizer applied to
+ * token, and startRow/numRows presumably describe a range in Index.rows --
+ * confirm against the code that builds entries.
+ */
- public IndexEntry(final String token, final int startRow) {
+ public IndexEntry(final String token, final String normalizedToken, final int startRow, final int numRows) {
assert token.equals(token.trim());
assert token.length() > 0;
this.token = token;
+ this.normalizedToken = normalizedToken;
this.startRow = startRow;
+ this.numRows = numRows;
}
+ /**
+ * Reads one entry from {@code raf} in the order that write(RandomAccessFile)
+ * emits it: UTF token, int startRow, int numRows, a boolean flag, and -- only
+ * when the flag is true -- a UTF normalized token.
+ *
+ * @throws IOException if any of the reads fails.
+ */
public IndexEntry(final RandomAccessFile raf) throws IOException {
token = raf.readUTF();
startRow = raf.readInt();
+ numRows = raf.readInt();
+ final boolean hasNormalizedForm = raf.readBoolean();
+ // When no separate form was stored, the token is its own normalized form.
+ normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
}
+ /**
+ * Writes this entry to {@code raf}: UTF token, int startRow, int numRows, then
+ * a boolean telling whether a separate normalized token follows, and the UTF
+ * normalized token only when it differs from the token.
+ *
+ * @throws IOException if any of the writes fails.
+ */
public void write(RandomAccessFile raf) throws IOException {
raf.writeUTF(token);
raf.writeInt(startRow);
+ raf.writeInt(numRows);
+ // Skip the second string when it would just repeat the token.
+ final boolean hasNormalizedForm = !token.equals(normalizedToken);
+ raf.writeBoolean(hasNormalizedForm);
+ if (hasNormalizedForm) {
+ raf.writeUTF(normalizedToken);
+ }
}
+ /** Formats the entry as {@code token@startRow(numRows)}. */
+ @Override
public String toString() {
- return token + "@" + startRow;
+ return String.format("%s@%d(%d)", token, startRow, numRows);
+ }
+
+ /**
+ * Returns the stored normalized form of {@code token}; this is the string
+ * that findInsertionPoint and windBackCase compare against.
+ */
+ public String normalizedToken() {
+ return normalizedToken;
}
}
+ /**
+ * Binary-searches {@code sortedIndexEntries} for {@code token}.
+ * The token is first normalized, then compared with the sort language's
+ * collator against each entry's {@code normalizedToken()}. On a collator
+ * match, and also when the search ends without one, the result index is
+ * passed through windBackCase so that the earliest of adjacent entries with
+ * the same normalized token is returned. Without a match, the entry at the
+ * insertion position (clamped to the last entry) is used.
+ *
+ * Assumes the index is non-empty: with no entries the final lookup uses
+ * index -1 and throws IndexOutOfBoundsException.
+ *
+ * @param token the search text, not yet normalized.
+ * @param interrupted polled on each iteration. NOTE(review): the body of that
+ *        check is empty in this view and looks elided -- confirm what it
+ *        should do on interruption.
+ * @return the entry at or near where {@code token} would be inserted.
+ */
public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
- token = sortLanguage.textNorm(token, true);
+ if (TransliteratorManager.init(null)) {
+ // Fetch the normalizer only on this branch, so the fallback path below
+ // does not force a Transliterator to be built.
+ token = normalizer().transliterate(token);
+ } else {
+ // Do our best since the Transliterators aren't up yet.
+ // Locale.ROOT keeps the fallback independent of the device's default locale.
+ token = token.toLowerCase(java.util.Locale.ROOT);
+ }
int start = 0;
int end = sortedIndexEntries.size();
- final Collator sortCollator = sortLanguage.getSortCollator();
+ final Collator sortCollator = sortLanguage.getCollator();
while (start < end) {
final int mid = (start + end) / 2;
if (interrupted.get()) {
}
final IndexEntry midEntry = sortedIndexEntries.get(mid);
- final int comp = sortCollator.compare(token, sortLanguage.textNorm(midEntry.token, true));
+ final int comp = sortCollator.compare(token, midEntry.normalizedToken());
if (comp == 0) {
- final int result = windBackCase(token, mid, sortCollator, interrupted);
+ final int result = windBackCase(token, mid, interrupted);
return sortedIndexEntries.get(result);
} else if (comp < 0) {
-// Log.d("THAD", "Upper bound: " + midEntry);
+ //System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
end = mid;
} else {
-// Log.d("THAD", "Lower bound: " + midEntry);
+ //System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken() + ", mid=" + mid);
start = mid + 1;
}
}
// If we search for a substring of a string that's in there, return that.
int result = Math.min(start, sortedIndexEntries.size() - 1);
- result = windBackCase(sortLanguage.textNorm(sortedIndexEntries.get(result).token, true), result, sortCollator, interrupted);
+ result = windBackCase(sortedIndexEntries.get(result).normalizedToken(), result, interrupted);
return sortedIndexEntries.get(result);
}
+ /**
+ * Immutable holder for the outcome of a search: two index entries, a prefix
+ * string and a success flag, all stored exactly as passed to the constructor.
+ * NOTE(review): the only visible producer is the commented-out
+ * findLongestSubstring, which sets success to true when the token matched
+ * without being shortened -- confirm against the live caller.
+ */
public static final class SearchResult {
- final IndexEntry insertionPoint;
- final IndexEntry longestPrefix;
- final String longestPrefixString;
+ public final IndexEntry insertionPoint;
+ public final IndexEntry longestPrefix;
+ public final String longestPrefixString;
+ public final boolean success;
public SearchResult(IndexEntry insertionPoint, IndexEntry longestPrefix,
- String longestPrefixString) {
+ String longestPrefixString, boolean success) {
this.insertionPoint = insertionPoint;
this.longestPrefix = longestPrefix;
this.longestPrefixString = longestPrefixString;
+ this.success = success;
}
- }
-
- public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
- IndexEntry insertionPoint = null;
- IndexEntry result = null;
- while (!interrupted.get() && token.length() > 0) {
- result = findInsertionPoint(token, interrupted);
- if (result == null) {
- return null;
- }
- if (insertionPoint == null) {
- insertionPoint = result;
- }
- if (sortLanguage.textNorm(result.token, true).startsWith(sortLanguage.textNorm(token, true))) {
- return new SearchResult(insertionPoint, result, token);
- }
- token = token.substring(0, token.length() - 1);
+
+ /**
+ * Returns a one-line dump of all four fields for debugging.
+ * The entries are passed to the formatter directly rather than via an
+ * explicit toString() call, so a null entry prints as "null" instead of
+ * throwing NullPointerException.
+ */
+ @Override
+ public String toString() {
+ return String.format("insertionPoint=%s,longestPrefix=%s,longestPrefixString=%s,success=%b", insertionPoint, longestPrefix, longestPrefixString, success);
}
- return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "");
}
- private final int windBackCase(final String token, int result, final Collator sortCollator, final AtomicBoolean interrupted) {
- while (result > 0 && sortCollator.compare(sortLanguage.textNorm(sortedIndexEntries.get(result - 1).token, true), token) >= 0) {
+// public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
+// token = normalizer.transliterate(token);
+// if (token.length() == 0) {
+// return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
+// }
+// IndexEntry insertionPoint = null;
+// IndexEntry result = null;
+// boolean unmodified = true;
+// while (!interrupted.get() && token.length() > 0) {
+// result = findInsertionPoint(token, interrupted);
+// if (result == null) {
+// return null;
+// }
+// if (unmodified) {
+// insertionPoint = result;
+// }
+// if (result.normalizedToken(normalizer).startsWith(token)) {
+// return new SearchResult(insertionPoint, result, token, unmodified);
+// }
+// unmodified = false;
+// token = token.substring(0, token.length() - 1);
+// }
+// return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
+// }
+
+ private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
+ while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken().equals(token)) {
--result;
if (interrupted.get()) {
return result;
return result;
}
+ /*
+ public int tokenRowBinarySearch(final int rowIndex) {
+ int start = 0;
+ int end = sortedIndexEntries.size();
+ while (start < end) {
+ final int mid = (start + end) / 2;
+ final int midRowIndex = sortedIndexEntries.get(mid).startRow;
+ if (midRowIndex == rowIndex) {
+ return mid;
+ }
+ if ()
+ }
+ }
+ */
}
\ No newline at end of file