public void run() {\r
final long startMillis = System.currentTimeMillis();\r
for (final Index index : dictionary.indices) {\r
- index.sortLanguage.getFindCollator();\r
- final com.ibm.icu.text.Collator c = index.sortLanguage\r
- .getSortCollator();\r
+ final com.ibm.icu.text.Collator c = index.sortLanguage.getCollator();\r
if (c.compare("pre-print", "preppy") >= 0) {\r
Log.e(LOG, c.getClass()\r
+ " is buggy, lookups may not work properly.");\r
return;\r
}\r
\r
- final Index.SearchResult searchResult = searchOperation.searchResult;\r
+ final Index.IndexEntry searchResult = searchOperation.searchResult;\r
Log.d(LOG, "searchFinished: " + searchOperation + ", searchResult=" + searchResult);\r
\r
jumpToRow(searchResult.longestPrefix.startRow);\r
\r
- if (!searchResult.success) {\r
- if (vibrator != null) {\r
- vibrator.vibrate(VIBRATE_MILLIS);\r
- }\r
- searchText.setText(searchResult.longestPrefixString);\r
- searchText.setSelection(searchResult.longestPrefixString.length());\r
- return;\r
- }\r
+// if (!searchResult.success) {\r
+// if (vibrator != null) {\r
+// vibrator.vibrate(VIBRATE_MILLIS);\r
+// }\r
+// searchText.setText(searchResult.longestPrefixString);\r
+// searchText.setSelection(searchResult.longestPrefixString.length());\r
+// return;\r
+// }\r
}\r
\r
private final void jumpToRow(final int row) {\r
\r
long searchStartMillis;\r
\r
- Index.SearchResult searchResult;\r
+ Index.IndexEntry searchResult;\r
\r
SearchOperation(final String searchText, final Index index) {\r
this.searchText = searchText.trim();\r
@Override\r
public void run() {\r
searchStartMillis = System.currentTimeMillis();\r
- searchResult = index.findLongestSubstring(searchText, interrupted);\r
+ searchResult = index.findInsertionPoint(searchText, interrupted);\r
Log.d(LOG, "searchText=" + searchText + ", searchDuration="\r
+ (System.currentTimeMillis() - searchStartMillis) + ", interrupted="\r
+ interrupted.get());\r
import com.hughes.util.raf.RAFSerializer;
import com.hughes.util.raf.UniformRAFList;
import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.Transliterator;
public final class Index implements RAFSerializable<Index> {
// persisted: tells how the entries are sorted.
public final Language sortLanguage;
+ final String normalizerRules;
+
+ // Built from the two above.
+ final Transliterator normalizer;
// persisted
public final List<IndexEntry> sortedIndexEntries;
// --------------------------------------------------------------------------
- public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final boolean swapPairEntries) {
+ public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) { // Builds a new index whose entry and row lists start out empty.
this.dict = dict;
this.shortName = shortName;
this.longName = longName;
this.sortLanguage = sortLanguage;
+ this.normalizerRules = normalizerRules; // Kept as text; the same string is written out with writeUTF when the index is serialized.
this.swapPairEntries = swapPairEntries;
sortedIndexEntries = new ArrayList<IndexEntry>();
rows = new ArrayList<RowBase>();
+
+ normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD); // Compile the rule text once, in the forward direction, with an empty transliterator ID.
}
public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException {
longName = raf.readUTF();
final String languageCode = raf.readUTF();
sortLanguage = Language.lookup(languageCode);
+ normalizerRules = raf.readUTF();
swapPairEntries = raf.readBoolean();
if (sortLanguage == null) {
throw new IOException("Unsupported language: " + languageCode);
}
sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE);
rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE);
+
+ normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD);
}
@Override
raf.writeUTF(shortName);
raf.writeUTF(longName);
raf.writeUTF(sortLanguage.getSymbol());
+ raf.writeUTF(normalizerRules);
raf.writeBoolean(swapPairEntries);
RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER);
UniformRAFList.write(raf, (Collection<RowBase>) rows, new RowBase.Serializer(this), 5);
public final int startRow;
public final int numRows;
+ private String normalizedToken;
+
static final RAFSerializer<IndexEntry> SERIALIZER = new RAFSerializer<IndexEntry> () {
@Override
public IndexEntry read(RandomAccessFile raf) throws IOException {
public String toString() {
return String.format("%s@%d(%d)", token, startRow, numRows);
}
+
+ /**
+  * Returns this entry's token in normalized form, computing it on first use.
+  *
+  * <p>The result is stored in {@code normalizedToken} and never recomputed, so the
+  * {@code normalizer} argument is only consulted on the first call. Callers are
+  * expected to pass the same normalizer on every call; a different one passed
+  * later is silently ignored.
+  *
+  * @param normalizer transliterator applied to {@code token} when nothing is cached yet
+  * @return the cached normalized form of {@code token}
+  */
+ public synchronized String normalizedToken(final Transliterator normalizer) {
+   if (normalizedToken == null) {
+     // Use the same Transliterator entry point that findInsertionPoint applies
+     // to the query string, so both sides of the comparison are normalized
+     // through one code path.
+     normalizedToken = normalizer.transliterate(token);
+   }
+   return normalizedToken;
+ }
}
public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) {
- token = sortLanguage.textNorm(token, true);
+ token = normalizer.transliterate(token);
int start = 0;
int end = sortedIndexEntries.size();
- final Collator sortCollator = sortLanguage.getSortCollator();
+ final Collator sortCollator = sortLanguage.getCollator();
while (start < end) {
final int mid = (start + end) / 2;
if (interrupted.get()) {
}
final IndexEntry midEntry = sortedIndexEntries.get(mid);
- final int comp = sortCollator.compare(token, sortLanguage.textNorm(midEntry.token, true));
+ final int comp = sortCollator.compare(token, midEntry.normalizedToken(normalizer));
if (comp == 0) {
- final int result = windBackCase(token, mid, sortCollator, interrupted);
+ final int result = windBackCase(token, mid, interrupted);
return sortedIndexEntries.get(result);
} else if (comp < 0) {
-// Log.d("THAD", "Upper bound: " + midEntry);
+ System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken(normalizer) + ", mid=" + mid);
end = mid;
} else {
-// Log.d("THAD", "Lower bound: " + midEntry);
+ System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken(normalizer) + ", mid=" + mid);
start = mid + 1;
}
}
// If we search for a substring of a string that's in there, return that.
int result = Math.min(start, sortedIndexEntries.size() - 1);
- result = windBackCase(sortLanguage.textNorm(sortedIndexEntries.get(result).token, true), result, sortCollator, interrupted);
+ result = windBackCase(sortedIndexEntries.get(result).normalizedToken(normalizer), result, interrupted);
return sortedIndexEntries.get(result);
}
}
}
- public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
- if (token.length() == 0) {
- return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
- }
- IndexEntry insertionPoint = null;
- IndexEntry result = null;
- boolean unmodified = true;
- while (!interrupted.get() && token.length() > 0) {
- result = findInsertionPoint(token, interrupted);
- if (result == null) {
- return null;
- }
- if (unmodified) {
- insertionPoint = result;
- }
- if (sortLanguage.textNorm(result.token, true).startsWith(sortLanguage.textNorm(token, true))) {
- return new SearchResult(insertionPoint, result, token, unmodified);
- }
- unmodified = false;
- token = token.substring(0, token.length() - 1);
- }
- return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
- }
+// public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) {
+// token = normalizer.transliterate(token);
+// if (token.length() == 0) {
+// return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true);
+// }
+// IndexEntry insertionPoint = null;
+// IndexEntry result = null;
+// boolean unmodified = true;
+// while (!interrupted.get() && token.length() > 0) {
+// result = findInsertionPoint(token, interrupted);
+// if (result == null) {
+// return null;
+// }
+// if (unmodified) {
+// insertionPoint = result;
+// }
+// if (result.normalizedToken(normalizer).startsWith(token)) {
+// return new SearchResult(insertionPoint, result, token, unmodified);
+// }
+// unmodified = false;
+// token = token.substring(0, token.length() - 1);
+// }
+// return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false);
+// }
- private final int windBackCase(final String token, int result, final Collator sortCollator, final AtomicBoolean interrupted) {
- while (result > 0 && sortCollator.compare(sortLanguage.textNorm(sortedIndexEntries.get(result - 1).token, true), token) >= 0) {
+ private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) {
+ while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken(normalizer).equals(token)) {
--result;
if (interrupted.get()) {
return result;
package com.hughes.android.dictionary.engine;\r
\r
-import java.util.Comparator;\r
import java.util.LinkedHashMap;\r
import java.util.Locale;\r
import java.util.Map;\r
\r
final String symbol;\r
final Locale locale;\r
-\r
- Collator sortCollator;\r
- final Comparator<String> sortComparator;\r
-\r
- private Collator findCollator;\r
- final Comparator<String> findComparator;\r
+ \r
+ final Collator collator;\r
\r
public Language(final Locale locale) {\r
this.symbol = locale.getLanguage();\r
this.locale = locale;\r
+ this.collator = Collator.getInstance(locale);\r
+ this.collator.setStrength(Collator.IDENTICAL);\r
\r
- this.sortComparator = new Comparator<String>() {\r
- public int compare(final String s1, final String s2) {\r
- return getSortCollator().compare(textNorm(s1, false), textNorm(s2, false));\r
- }\r
- };\r
-\r
- this.findComparator = new Comparator<String>() {\r
- public int compare(final String s1, final String s2) {\r
- return getFindCollator().compare(textNorm(s1, false), textNorm(s2, false));\r
- }\r
- };\r
- \r
symbolToLangauge.put(symbol.toLowerCase(), this);\r
}\r
\r
- public String textNorm(final String s, final boolean toLower) {\r
- return toLower ? s.toLowerCase() : s;\r
- }\r
-\r
@Override\r
public String toString() {\r
return locale.toString();\r
return symbol;\r
}\r
\r
- public synchronized Collator getFindCollator() {\r
- if (findCollator == null) {\r
- findCollator = Collator.getInstance(locale);\r
- findCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);\r
- findCollator.setStrength(Collator.SECONDARY);\r
- }\r
- return findCollator;\r
+ public Collator getCollator() {\r
+ return collator;\r
}\r
-\r
- public synchronized Collator getSortCollator() {\r
- if (sortCollator == null) {\r
- sortCollator = Collator.getInstance(locale);\r
- sortCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);\r
- sortCollator.setStrength(Collator.IDENTICAL);\r
- }\r
- return sortCollator;\r
+ \r
+ public String getDefaultNormalizerRules() {\r
+ return ":: Any-Latin; :: Lower; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;";\r
}\r
-\r
// ----------------------------------------------------------------\r
\r
public static final Language en = new Language(Locale.ENGLISH);\r
\r
public static final Language de = new Language(Locale.GERMAN) {\r
@Override\r
- public String textNorm(String token, final boolean toLower) {\r
- if (toLower) {\r
- token = token.toLowerCase();\r
- }\r
- boolean sub = false;\r
- // This is meant to be fast: occurrences of ae, oe, ue are probably rare.\r
- for (int ePos = token.indexOf('e', 1); ePos != -1; ePos = token.indexOf(\r
- 'e', ePos + 1)) {\r
- final char pre = Character.toLowerCase(token.charAt(ePos - 1));\r
- if (pre == 'a' || pre == 'o' || pre == 'u') {\r
- sub = true;\r
- break;\r
- }\r
- }\r
- if (!sub) {\r
- return token;\r
- }\r
- \r
- token = token.replaceAll("ae", "ä");\r
- token = token.replaceAll("oe", "ö");\r
- token = token.replaceAll("ue", "ü");\r
-\r
- token = token.replaceAll("Ae", "Ä");\r
- token = token.replaceAll("Oe", "Ö");\r
- token = token.replaceAll("Ue", "Ü");\r
-\r
- token = token.replaceAll("AE", "Ä");\r
- token = token.replaceAll("OE", "Ö");\r
- token = token.replaceAll("UE", "Ü");\r
- \r
- return token; \r
+ public String getDefaultNormalizerRules() {\r
+ return ":: Lower; 'ae' > 'ä'; 'oe' > 'ö'; 'ue' > 'ü'; 'ß' > 'ss'; ";\r
}\r
};\r
\r