From: Thad Hughes Date: Sun, 21 Nov 2010 02:58:45 +0000 (-0800) Subject: go X-Git-Url: http://gitweb.fperrin.net/?p=Dictionary.git;a=commitdiff_plain;h=7570dd30c59e1fa56758af1e3b97b531df47fe9e go --- diff --git a/.classpath b/.classpath index f033a2a..1232048 100644 --- a/.classpath +++ b/.classpath @@ -3,7 +3,7 @@ - + diff --git a/images/icon.odg b/images/icon.odg index f70cbd7..a3d19d6 100644 Binary files a/images/icon.odg and b/images/icon.odg differ diff --git a/images/icon.png b/images/icon.png index 99ae67f..dc1df50 100644 Binary files a/images/icon.png and b/images/icon.png differ diff --git a/res/drawable/icon.png b/res/drawable/icon.png index 99ae67f..dc1df50 100644 Binary files a/res/drawable/icon.png and b/res/drawable/icon.png differ diff --git a/src/com/hughes/android/dictionary/DictionaryActivity.java b/src/com/hughes/android/dictionary/DictionaryActivity.java index afca2b5..a1521fa 100644 --- a/src/com/hughes/android/dictionary/DictionaryActivity.java +++ b/src/com/hughes/android/dictionary/DictionaryActivity.java @@ -139,9 +139,7 @@ public class DictionaryActivity extends ListActivity { public void run() { final long startMillis = System.currentTimeMillis(); for (final Index index : dictionary.indices) { - index.sortLanguage.getFindCollator(); - final com.ibm.icu.text.Collator c = index.sortLanguage - .getSortCollator(); + final com.ibm.icu.text.Collator c = index.sortLanguage.getCollator(); if (c.compare("pre-print", "preppy") >= 0) { Log.e(LOG, c.getClass() + " is buggy, lookups may not work properly."); @@ -438,19 +436,19 @@ public class DictionaryActivity extends ListActivity { return; } - final Index.SearchResult searchResult = searchOperation.searchResult; + final Index.IndexEntry searchResult = searchOperation.searchResult; Log.d(LOG, "searchFinished: " + searchOperation + ", searchResult=" + searchResult); jumpToRow(searchResult.longestPrefix.startRow); - if (!searchResult.success) { - if (vibrator != null) { - vibrator.vibrate(VIBRATE_MILLIS); - } - searchText.setText(searchResult.longestPrefixString); - searchText.setSelection(searchResult.longestPrefixString.length()); - return; - } +// if (!searchResult.success) { +// if (vibrator != null) { +// vibrator.vibrate(VIBRATE_MILLIS); +// } +// searchText.setText(searchResult.longestPrefixString); +// searchText.setSelection(searchResult.longestPrefixString.length()); +// return; +// } } private final void jumpToRow(final int row) { @@ -466,7 +464,7 @@ public class DictionaryActivity extends ListActivity { long searchStartMillis; - Index.SearchResult searchResult; + Index.IndexEntry searchResult; SearchOperation(final String searchText, final Index index) { this.searchText = searchText.trim(); @@ -480,7 +478,7 @@ public class DictionaryActivity extends ListActivity { @Override public void run() { searchStartMillis = System.currentTimeMillis(); - searchResult = index.findLongestSubstring(searchText, interrupted); + searchResult = index.findInsertionPoint(searchText, interrupted); Log.d(LOG, "searchText=" + searchText + ", searchDuration=" + (System.currentTimeMillis() - searchStartMillis) + ", interrupted=" + interrupted.get()); diff --git a/src/com/hughes/android/dictionary/engine/Index.java b/src/com/hughes/android/dictionary/engine/Index.java index 2d3d42f..7cee746 100644 --- a/src/com/hughes/android/dictionary/engine/Index.java +++ b/src/com/hughes/android/dictionary/engine/Index.java @@ -17,6 +17,7 @@ import com.hughes.util.raf.RAFSerializable; import com.hughes.util.raf.RAFSerializer; import com.hughes.util.raf.UniformRAFList; import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Transliterator; public final class Index implements RAFSerializable { @@ -29,6 +30,10 @@ public final class Index implements RAFSerializable { // persisted: tells how the entries are sorted. public final Language sortLanguage; + final String normalizerRules; + + // Built from the two above. + final Transliterator normalizer; // persisted public final List sortedIndexEntries; @@ -42,14 +47,17 @@ public final class Index implements RAFSerializable { // -------------------------------------------------------------------------- - public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final boolean swapPairEntries) { + public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries) { this.dict = dict; this.shortName = shortName; this.longName = longName; this.sortLanguage = sortLanguage; + this.normalizerRules = normalizerRules; this.swapPairEntries = swapPairEntries; sortedIndexEntries = new ArrayList(); rows = new ArrayList(); + + normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD); } public Index(final Dictionary dict, final RandomAccessFile raf) throws IOException { @@ -58,12 +66,15 @@ public final class Index implements RAFSerializable { longName = raf.readUTF(); final String languageCode = raf.readUTF(); sortLanguage = Language.lookup(languageCode); + normalizerRules = raf.readUTF(); swapPairEntries = raf.readBoolean(); if (sortLanguage == null) { throw new IOException("Unsupported language: " + languageCode); } sortedIndexEntries = CachingList.create(RAFList.create(raf, IndexEntry.SERIALIZER, raf.getFilePointer()), CACHE_SIZE); rows = CachingList.create(UniformRAFList.create(raf, new RowBase.Serializer(this), raf.getFilePointer()), CACHE_SIZE); + + normalizer = Transliterator.createFromRules("", normalizerRules, Transliterator.FORWARD); } @Override @@ -71,6 +82,7 @@ public final class Index implements RAFSerializable { raf.writeUTF(shortName); raf.writeUTF(longName); raf.writeUTF(sortLanguage.getSymbol()); + raf.writeUTF(normalizerRules); raf.writeBoolean(swapPairEntries); RAFList.write(raf, sortedIndexEntries, IndexEntry.SERIALIZER); UniformRAFList.write(raf, (Collection) rows, new RowBase.Serializer(this), 5); @@ -87,6 +99,8 @@ public final class Index implements RAFSerializable { public final int startRow; public final int numRows; + private String normalizedToken; + static final RAFSerializer SERIALIZER = new RAFSerializer () { @Override public IndexEntry read(RandomAccessFile raf) throws IOException { @@ -120,15 +134,22 @@ public final class Index implements RAFSerializable { public String toString() { return String.format("%s@%d(%d)", token, startRow, numRows); } + + public synchronized String normalizedToken(final Transliterator normalizer) { + if (normalizedToken == null) { + normalizedToken = normalizer.transform(token); + } + return normalizedToken; + } } public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) { - token = sortLanguage.textNorm(token, true); + token = normalizer.transliterate(token); int start = 0; int end = sortedIndexEntries.size(); - final Collator sortCollator = sortLanguage.getSortCollator(); + final Collator sortCollator = sortLanguage.getCollator(); while (start < end) { final int mid = (start + end) / 2; if (interrupted.get()) { @@ -136,22 +157,22 @@ public final class Index implements RAFSerializable { } final IndexEntry midEntry = sortedIndexEntries.get(mid); - final int comp = sortCollator.compare(token, sortLanguage.textNorm(midEntry.token, true)); + final int comp = sortCollator.compare(token, midEntry.normalizedToken(normalizer)); if (comp == 0) { - final int result = windBackCase(token, mid, sortCollator, interrupted); + final int result = windBackCase(token, mid, interrupted); return sortedIndexEntries.get(result); } else if (comp < 0) { -// Log.d("THAD", "Upper bound: " + midEntry); + System.out.println("Upper bound: " + midEntry + ", norm=" + midEntry.normalizedToken(normalizer) + ", mid=" + mid); end = mid; } else { -// Log.d("THAD", "Lower bound: " + midEntry); + System.out.println("Lower bound: " + midEntry + ", norm=" + midEntry.normalizedToken(normalizer) + ", mid=" + mid); start = mid + 1; } } // If we search for a substring of a string that's in there, return that. int result = Math.min(start, sortedIndexEntries.size() - 1); - result = windBackCase(sortLanguage.textNorm(sortedIndexEntries.get(result).token, true), result, sortCollator, interrupted); + result = windBackCase(sortedIndexEntries.get(result).normalizedToken(normalizer), result, interrupted); return sortedIndexEntries.get(result); } @@ -175,32 +196,33 @@ public final class Index implements RAFSerializable { } } - public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) { - if (token.length() == 0) { - return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true); - } - IndexEntry insertionPoint = null; - IndexEntry result = null; - boolean unmodified = true; - while (!interrupted.get() && token.length() > 0) { - result = findInsertionPoint(token, interrupted); - if (result == null) { - return null; - } - if (unmodified) { - insertionPoint = result; - } - if (sortLanguage.textNorm(result.token, true).startsWith(sortLanguage.textNorm(token, true))) { - return new SearchResult(insertionPoint, result, token, unmodified); - } - unmodified = false; - token = token.substring(0, token.length() - 1); - } - return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false); - } +// public SearchResult findLongestSubstring(String token, final AtomicBoolean interrupted) { +// token = normalizer.transliterate(token); +// if (token.length() == 0) { +// return new SearchResult(sortedIndexEntries.get(0), sortedIndexEntries.get(0), "", true); +// } +// IndexEntry insertionPoint = null; +// IndexEntry result = null; +// boolean unmodified = true; +// while (!interrupted.get() && token.length() > 0) { +// result = findInsertionPoint(token, interrupted); +// if (result == null) { +// return null; +// } +// if (unmodified) { +// insertionPoint = result; +// } +// if (result.normalizedToken(normalizer).startsWith(token)) { +// return new SearchResult(insertionPoint, result, token, unmodified); +// } +// unmodified = false; +// token = token.substring(0, token.length() - 1); +// } +// return new SearchResult(insertionPoint, sortedIndexEntries.get(0), "", false); +// } - private final int windBackCase(final String token, int result, final Collator sortCollator, final AtomicBoolean interrupted) { - while (result > 0 && sortCollator.compare(sortLanguage.textNorm(sortedIndexEntries.get(result - 1).token, true), token) >= 0) { + private final int windBackCase(final String token, int result, final AtomicBoolean interrupted) { + while (result > 0 && sortedIndexEntries.get(result - 1).normalizedToken(normalizer).equals(token)) { --result; if (interrupted.get()) { return result; diff --git a/src/com/hughes/android/dictionary/engine/Language.java b/src/com/hughes/android/dictionary/engine/Language.java index b4d8558..42ad7b6 100755 --- a/src/com/hughes/android/dictionary/engine/Language.java +++ b/src/com/hughes/android/dictionary/engine/Language.java @@ -1,6 +1,5 @@ package com.hughes.android.dictionary.engine; -import java.util.Comparator; import java.util.LinkedHashMap; import java.util.Locale; import java.util.Map; @@ -13,36 +12,18 @@ public class Language { final String symbol; final Locale locale; - - Collator sortCollator; - final Comparator sortComparator; - - private Collator findCollator; - final Comparator findComparator; + + final Collator collator; public Language(final Locale locale) { this.symbol = locale.getLanguage(); this.locale = locale; + this.collator = Collator.getInstance(locale); + this.collator.setStrength(Collator.IDENTICAL); - this.sortComparator = new Comparator() { - public int compare(final String s1, final String s2) { - return getSortCollator().compare(textNorm(s1, false), textNorm(s2, false)); - } - }; - - this.findComparator = new Comparator() { - public int compare(final String s1, final String s2) { - return getFindCollator().compare(textNorm(s1, false), textNorm(s2, false)); - } - }; - symbolToLangauge.put(symbol.toLowerCase(), this); } - public String textNorm(final String s, final boolean toLower) { - return toLower ? s.toLowerCase() : s; - } - @Override public String toString() { return locale.toString(); @@ -52,24 +33,13 @@ public class Language { return symbol; } - public synchronized Collator getFindCollator() { - if (findCollator == null) { - findCollator = Collator.getInstance(locale); - findCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); - findCollator.setStrength(Collator.SECONDARY); - } - return findCollator; + public Collator getCollator() { + return collator; } - - public synchronized Collator getSortCollator() { - if (sortCollator == null) { - sortCollator = Collator.getInstance(locale); - sortCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); - sortCollator.setStrength(Collator.IDENTICAL); - } - return sortCollator; + + public String getDefaultNormalizerRules() { + return ":: Any-Latin; :: Lower; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;"; } - // ---------------------------------------------------------------- public static final Language en = new Language(Locale.ENGLISH); @@ -78,37 +48,8 @@ public class Language { public static final Language de = new Language(Locale.GERMAN) { @Override - public String textNorm(String token, final boolean toLower) { - if (toLower) { - token = token.toLowerCase(); - } - boolean sub = false; - // This is meant to be fast: occurrences of ae, oe, ue are probably rare. - for (int ePos = token.indexOf('e', 1); ePos != -1; ePos = token.indexOf( - 'e', ePos + 1)) { - final char pre = Character.toLowerCase(token.charAt(ePos - 1)); - if (pre == 'a' || pre == 'o' || pre == 'u') { - sub = true; - break; - } - } - if (!sub) { - return token; - } - - token = token.replaceAll("ae", "ä"); - token = token.replaceAll("oe", "ö"); - token = token.replaceAll("ue", "ü"); - - token = token.replaceAll("Ae", "Ä"); - token = token.replaceAll("Oe", "Ö"); - token = token.replaceAll("Ue", "Ü"); - - token = token.replaceAll("AE", "Ä"); - token = token.replaceAll("OE", "Ö"); - token = token.replaceAll("UE", "Ü"); - - return token; + public String getDefaultNormalizerRules() { + return ":: Lower; 'ae' > 'ä'; 'oe' > 'ö'; 'ue' > 'ü'; 'ß' > 'ss'; "; } };