From: thadh Date: Fri, 29 May 2009 21:57:41 +0000 (-0700) Subject: go X-Git-Url: http://gitweb.fperrin.net/?a=commitdiff_plain;h=beb33e1b7f7e320eb434dcb0b2c03b3730e48a68;p=Dictionary.git go --- diff --git a/src/com/hughes/android/dictionary/Dictionary.java b/src/com/hughes/android/dictionary/Dictionary.java index 8a68d6c..7fd38f5 100755 --- a/src/com/hughes/android/dictionary/Dictionary.java +++ b/src/com/hughes/android/dictionary/Dictionary.java @@ -100,10 +100,10 @@ public final class Dictionary implements RAFSerializable { } final IndexEntry midEntry = sortedIndex.get(mid); - final int comp = language.tokenComparator.compare(word, midEntry.word.toLowerCase()); + final int comp = language.sortComparator.compare(word, midEntry.word.toLowerCase()); if (comp == 0) { int result = mid; - while (result > 0 && language.tokenComparator.compare(word, sortedIndex.get(result - 1).word.toLowerCase()) == 0) { + while (result > 0 && language.findComparator.compare(word, sortedIndex.get(result - 1).word.toLowerCase()) == 0) { --result; if (interrupted.get()) { return result; diff --git a/src/com/hughes/android/dictionary/Entry.java b/src/com/hughes/android/dictionary/Entry.java index 6143b49..3facbf5 100755 --- a/src/com/hughes/android/dictionary/Entry.java +++ b/src/com/hughes/android/dictionary/Entry.java @@ -152,6 +152,11 @@ public final class Entry implements RAFSerializable { text = text.replaceAll("[\"/\\()<>\\[\\],;?!.]", " "); text = text.replaceAll("[:] ", " "); text = text.replaceAll(" [:]", " "); + + // Now be really conservative about what we allow inside a token: + // See: http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values + text = text.replaceAll("[^-:\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nd}\\p{Nl}\\p{No}]", " "); + result.addAll(Arrays.asList(WHITESPACE.split(text))); text = text.replaceAll("[-]", " "); diff --git a/src/com/hughes/android/dictionary/Language.java b/src/com/hughes/android/dictionary/Language.java index 122372d..9591bc0 100755 --- a/src/com/hughes/android/dictionary/Language.java +++ b/src/com/hughes/android/dictionary/Language.java @@ -1,81 +1,82 @@ package com.hughes.android.dictionary; +import java.text.Collator; import java.util.Comparator; import java.util.LinkedHashMap; +import java.util.Locale; import java.util.Map; -import com.hughes.util.StringUtil; - -public abstract class Language { +public class Language { final String symbol; - final Comparator tokenComparator; + final Locale locale; + + final Collator sortCollator; + final Comparator sortComparator; - public Language(final String symbol) { + final Collator findCollator; + final Comparator findComparator; + + public Language(final String symbol, final Locale locale) { this.symbol = symbol; - this.tokenComparator = new Comparator() { + this.locale = locale; + + this.sortCollator = Collator.getInstance(locale); + this.sortCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); + this.sortCollator.setStrength(Collator.IDENTICAL); + this.sortComparator = new Comparator() { public int compare(final String s1, final String s2) { - final String norm1 = normalizeTokenForSort(s1); - final String norm2 = normalizeTokenForSort(s2); - final int c = norm1.compareTo(norm2); - if (c != 0) { - return c; - } - return StringUtil.flipCase(StringUtil.reverse(s1)).compareTo(StringUtil.flipCase(StringUtil.reverse(s2))); - }}; + return sortCollator.compare(textNorm(s1), textNorm(s2)); + } + }; + + this.findCollator = Collator.getInstance(locale); + this.findCollator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); + this.findCollator.setStrength(Collator.SECONDARY); + this.findComparator = new Comparator() { + public int compare(final String s1, final String s2) { + return findCollator.compare(textNorm(s1), textNorm(s2)); + } + }; + } - + + public String textNorm(final String s) { + return s; + } + @Override public String toString() { return symbol; } - abstract String normalizeTokenForSort(final String token); + // ---------------------------------------------------------------- + public static final Language EN = new Language("EN", Locale.ENGLISH); - // ---------------------------------------------------------------- - - static final String normalizeTokenForSort(final String token, final boolean vowelETranslation) { - final StringBuilder result = new StringBuilder(); - for (int i = 0; i < token.length(); ++i) { - Character c = token.charAt(i); - c = Character.toLowerCase(c); - // only check for lowercase 'e' in subsequent position means don't treat acronyms as umlauted: SAE. - if (vowelETranslation && (c == 'a' || c == 'o' || c == 'u') && i + 1 < token.length() && token.charAt(i + 1) == 'e') { - if (c == 'a') { - result.append('ä'); - } else if (c == 'o') { - result.append('ö'); - } else if (c == 'u') { - result.append('ü'); + public static final Language DE = new Language("DE", Locale.GERMAN) { + @Override + public String textNorm(String token) { + boolean sub = false; + for (int ePos = token.indexOf('e', 1); ePos != -1; ePos = token.indexOf( + 'e', ePos + 1)) { + final char pre = Character.toLowerCase(token.charAt(ePos - 1)); + if (pre == 'a' || pre == 'o' || pre == 'u') { + sub = true; + break; } - ++i; - } else if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') { - result.append(c); - } else if (c == 'ß') { - result.append("ss"); - } else if (c == 'ä') { - result.append(c); - } else if (c == 'ö') { - result.append(c); - } else if (c == 'ü') { - result.append(c); } - } - return result.toString(); - } + if (!sub) { + return token; + } + token = token.replaceAll("ae", "ä"); + token = token.replaceAll("oe", "ö"); + token = token.replaceAll("ue", "ü"); - public static final Language EN = new Language("EN") { - @Override - public String normalizeTokenForSort(final String token) { - return Language.normalizeTokenForSort(token, false); - } - }; - - public static final Language DE = new Language("DE") { - @Override - String normalizeTokenForSort(final String token) { - return Language.normalizeTokenForSort(token, true); + token = token.replaceAll("Ae", "Ä"); + token = token.replaceAll("Oe", "Ö"); + token = token.replaceAll("Ue", "Ü"); + return token; } }; @@ -87,10 +88,9 @@ public abstract class Language { symbolToLangauge.put(EN.symbol, EN); symbolToLangauge.put(DE.symbol, DE); } - + static Language lookup(final String symbol) { return symbolToLangauge.get(symbol); } - }