X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=jars%2Ficu4j-4_2_1-src%2Fsrc%2Fcom%2Fibm%2Ficu%2Ftext%2FIndexCharacters.java;h=9fb47617f0d5dcc9701f71ea898aa13e9ef6d9eb;hb=127973afabe0c34015667c599d68bf9453d85652;hp=c3f7fa36ad0792fa40fc6b921a51015ff4dbbdd5;hpb=92dfc8b7d39cbc2e55f3c547c0c265bc7ae3af86;p=Dictionary.git diff --git a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java old mode 100755 new mode 100644 index c3f7fa3..9fb4761 --- a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java +++ b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java @@ -1,274 +1,274 @@ -//##header -//#if defined(FOUNDATION10) || defined(J2SE13) -//#else -/* - ******************************************************************************* - * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation - * and others. All Rights Reserved. - ******************************************************************************* - */ -package com.ibm.icu.text; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.impl.CollectionUtilities.MultiComparator; - -/** - * A set of characters for use as a UI "index", that is, a - * list of clickable characters (or character sequences) that allow the user to - * see a segment of a larger "target" list. That is, each character corresponds - * to a bucket in the target list, where everything in the bucket is greater - * than or equal to the character (according to the locale's collation). The - * intention is to have two main functions; one that produces an index list that - * is relatively static, and the other is a list that produces roughly - * equally-sized buckets. Only the first is currently provided. - *

- * The static list would be presented as something like - * - *

- *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
- * 
- * - * In the UI, an index character could be omitted if its bucket is empty. For - * example, if there is nothing in the bucket for Q, then Q could be omitted. - *

- * Important Notes: - *

- * - * @author markdavis - * @draft ICU 4.2 - * @provisional This API might change or be removed in a future release. - */ -//TODO(markdavis) return an additional character that is the "least greater" character than -//the last character. -public class IndexCharacters { - public static final char CGJ = '\u034F'; - private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]"); - private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]"); - private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); - private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]"); - - private ULocale locale; - private Collator comparator; - private Set indexCharacters; - private LinkedHashMap alreadyIn = new LinkedHashMap(); - private List noDistinctSorting = new ArrayList(); - private List notAlphabetic = new ArrayList(); - - /** - * Create the index object. - * @param locale - * @draft ICU 4.2 - * @provisional This API might change or be removed in a future release. - */ - public IndexCharacters(ULocale locale) { - this.locale = locale; - comparator = Collator.getInstance(locale); - comparator.setStrength(Collator.PRIMARY); - - // get the exemplars, and handle special cases - - UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD); - // question: should we add auxiliary exemplars? - if (exemplars.containsSome(CORE_LATIN)) { - exemplars.addAll(CORE_LATIN); - } - if (exemplars.containsSome(HANGUL)) { - // cut down to small list - exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL); - } - if (exemplars.containsSome(ETHIOPIC)) { - // cut down to small list - // make use of the fact that Ethiopic is allocated in 8's, where - // the base is 0 mod 8. - for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) { - if ((it.codepoint & 0x7) != 0) { - exemplars.remove(it.codepoint); - } - } - } - - // first sort them, with an "best" ordering among items that are the same according - // to the collator - - Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{ - comparator, new PreferenceComparator(Collator.getInstance(locale))})); - for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { - preferenceSorting.add(it.getString()); - } - - indexCharacters = new TreeSet(comparator); - - // We nw make a sorted array of elements, uppercased - // Some of the input may, however, be redundant. - // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" - // So we make a pass through, filtering out those cases. - - for (Iterator it = preferenceSorting.iterator(); it.hasNext();) { - String item = (String) it.next(); - item = UCharacter.toUpperCase(locale, item); - if (indexCharacters.contains(item)) { - for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) { - Object itemAlreadyIn = it2.next(); - if (comparator.compare(item, itemAlreadyIn) == 0) { - Set targets = (Set) alreadyIn.get(itemAlreadyIn); - if (targets == null) { - alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet()); - } - targets.add(item); - break; - } - } - } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){ - noDistinctSorting.add(item); - } else if (!ALPHABETIC.containsSome(item)) { - notAlphabetic.add(item); - } else { - indexCharacters.add(item); - } - } - - // if the result is still too large, cut down to 100 elements - - final int size = indexCharacters.size() - 1; - if (size > 99) { - int count = 0; - int old = -1; - for (Iterator it = indexCharacters.iterator(); it.hasNext();) { - ++ count; - it.next(); - final int bump = count * 99 / size; - if (bump == old) { - it.remove(); - } else { - old = bump; - } - } - } - indexCharacters = Collections.unmodifiableSet(indexCharacters); - } - - /* - * Return the string with interspersed CGJs. Input must have more than 2 codepoints. - */ - private String separated(String item) { - StringBuffer result = new StringBuffer(); - // add a CGJ except within surrogates - char last = item.charAt(0); - result.append(last); - for (int i = 1; i < item.length(); ++i) { - char ch = item.charAt(i); - if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) { - result.append(CGJ); - } - result.append(ch); - last = ch; - } - return result.toString(); - } - - /** - * Get the index characters. - * @return A collection including the index characters - * @draft ICU 4.2 - * @provisional This API might change or be removed in a future release. - */ - public Collection getIndexCharacters() { - return indexCharacters; - } - - /** - * Get the locale - * @return The locale. - * @draft ICU 4.2 - * @provisional This API might change or be removed in a future release. - */ - public ULocale getLocale() { - return locale; - } - - /** - * As the index is built, items may be discarded from the exemplars. - * This contains some of the discards, and is intended for debugging. - * @internal - */ - public Map getAlreadyIn() { - return alreadyIn; - } - - /** - * As the index is built, items may be discarded from the exemplars. - * This contains some of the discards, and is intended for debugging. - * @internal - */ - public List getNoDistinctSorting() { - return noDistinctSorting; - } - - /** - * As the index is built, items may be discarded from the exemplars. - * This contains some of the discards, and is intended for debugging. - * @internal - */ - public List getNotAlphabetic() { - return notAlphabetic; - } - - /* - * Comparator that returns "better" items first, where shorter NFKD is better, - * and otherwise NFKD binary order is better, and otherwise binary order is better. - */ - private static class PreferenceComparator implements Comparator { - static final Comparator binary = new UTF16.StringComparator(true,false,0); - final Collator collator; - - public PreferenceComparator(Collator collator) { - this.collator = collator; - } - - public int compare(Object o1, Object o2) { - if (o1 == o2) { - return 0; - } - String s1 = (String) o1; - String s2 = (String) o2; - String n1 = Normalizer.decompose(s1, true); - String n2 = Normalizer.decompose(s2, true); - int result = n1.length() - n2.length(); - if (result != 0) { - return result; - } - result = collator.compare(n1, n2); - if (result != 0) { - return result; - } - return binary.compare(s1, s2); - } - } -} -//#endif +//##header J2SE15 +//#if defined(FOUNDATION10) || defined(J2SE13) +//#else +/* + ******************************************************************************* + * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation + * and others. All Rights Reserved. + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.impl.CollectionUtilities.MultiComparator; + +/** + * A set of characters for use as a UI "index", that is, a + * list of clickable characters (or character sequences) that allow the user to + * see a segment of a larger "target" list. That is, each character corresponds + * to a bucket in the target list, where everything in the bucket is greater + * than or equal to the character (according to the locale's collation). The + * intention is to have two main functions; one that produces an index list that + * is relatively static, and the other is a list that produces roughly + * equally-sized buckets. Only the first is currently provided. + *

+ * The static list would be presented as something like + * + *

+ *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
+ * 
+ * + * In the UI, an index character could be omitted if its bucket is empty. For + * example, if there is nothing in the bucket for Q, then Q could be omitted. + *

+ * Important Notes: + *

+ * + * @author markdavis + * @draft ICU 4.2 + * @provisional This API might change or be removed in a future release. + */ +//TODO(markdavis) return an additional character that is the "least greater" character than +//the last character. +public class IndexCharacters { + public static final char CGJ = '\u034F'; + private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]"); + private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]"); + private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); + private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]"); + + private ULocale locale; + private Collator comparator; + private Set indexCharacters; + private LinkedHashMap alreadyIn = new LinkedHashMap(); + private List noDistinctSorting = new ArrayList(); + private List notAlphabetic = new ArrayList(); + + /** + * Create the index object. + * @param locale + * @draft ICU 4.2 + * @provisional This API might change or be removed in a future release. + */ + public IndexCharacters(ULocale locale) { + this.locale = locale; + comparator = Collator.getInstance(locale); + comparator.setStrength(Collator.PRIMARY); + + // get the exemplars, and handle special cases + + UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD); + // question: should we add auxiliary exemplars? + if (exemplars.containsSome(CORE_LATIN)) { + exemplars.addAll(CORE_LATIN); + } + if (exemplars.containsSome(HANGUL)) { + // cut down to small list + exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL); + } + if (exemplars.containsSome(ETHIOPIC)) { + // cut down to small list + // make use of the fact that Ethiopic is allocated in 8's, where + // the base is 0 mod 8. + for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) { + if ((it.codepoint & 0x7) != 0) { + exemplars.remove(it.codepoint); + } + } + } + + // first sort them, with an "best" ordering among items that are the same according + // to the collator + + Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{ + comparator, new PreferenceComparator(Collator.getInstance(locale))})); + for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { + preferenceSorting.add(it.getString()); + } + + indexCharacters = new TreeSet(comparator); + + // We nw make a sorted array of elements, uppercased + // Some of the input may, however, be redundant. + // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" + // So we make a pass through, filtering out those cases. + + for (Iterator it = preferenceSorting.iterator(); it.hasNext();) { + String item = (String) it.next(); + item = UCharacter.toUpperCase(locale, item); + if (indexCharacters.contains(item)) { + for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) { + Object itemAlreadyIn = it2.next(); + if (comparator.compare(item, itemAlreadyIn) == 0) { + Set targets = (Set) alreadyIn.get(itemAlreadyIn); + if (targets == null) { + alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet()); + } + targets.add(item); + break; + } + } + } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){ + noDistinctSorting.add(item); + } else if (!ALPHABETIC.containsSome(item)) { + notAlphabetic.add(item); + } else { + indexCharacters.add(item); + } + } + + // if the result is still too large, cut down to 100 elements + + final int size = indexCharacters.size() - 1; + if (size > 99) { + int count = 0; + int old = -1; + for (Iterator it = indexCharacters.iterator(); it.hasNext();) { + ++ count; + it.next(); + final int bump = count * 99 / size; + if (bump == old) { + it.remove(); + } else { + old = bump; + } + } + } + indexCharacters = Collections.unmodifiableSet(indexCharacters); + } + + /* + * Return the string with interspersed CGJs. Input must have more than 2 codepoints. + */ + private String separated(String item) { + StringBuffer result = new StringBuffer(); + // add a CGJ except within surrogates + char last = item.charAt(0); + result.append(last); + for (int i = 1; i < item.length(); ++i) { + char ch = item.charAt(i); + if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) { + result.append(CGJ); + } + result.append(ch); + last = ch; + } + return result.toString(); + } + + /** + * Get the index characters. + * @return A collection including the index characters + * @draft ICU 4.2 + * @provisional This API might change or be removed in a future release. + */ + public Collection getIndexCharacters() { + return indexCharacters; + } + + /** + * Get the locale + * @return The locale. + * @draft ICU 4.2 + * @provisional This API might change or be removed in a future release. + */ + public ULocale getLocale() { + return locale; + } + + /** + * As the index is built, items may be discarded from the exemplars. + * This contains some of the discards, and is intended for debugging. + * @internal + */ + public Map getAlreadyIn() { + return alreadyIn; + } + + /** + * As the index is built, items may be discarded from the exemplars. + * This contains some of the discards, and is intended for debugging. + * @internal + */ + public List getNoDistinctSorting() { + return noDistinctSorting; + } + + /** + * As the index is built, items may be discarded from the exemplars. + * This contains some of the discards, and is intended for debugging. + * @internal + */ + public List getNotAlphabetic() { + return notAlphabetic; + } + + /* + * Comparator that returns "better" items first, where shorter NFKD is better, + * and otherwise NFKD binary order is better, and otherwise binary order is better. + */ + private static class PreferenceComparator implements Comparator { + static final Comparator binary = new UTF16.StringComparator(true,false,0); + final Collator collator; + + public PreferenceComparator(Collator collator) { + this.collator = collator; + } + + public int compare(Object o1, Object o2) { + if (o1 == o2) { + return 0; + } + String s1 = (String) o1; + String s2 = (String) o2; + String n1 = Normalizer.decompose(s1, true); + String n2 = Normalizer.decompose(s2, true); + int result = n1.length() - n2.length(); + if (result != 0) { + return result; + } + result = collator.compare(n1, n2); + if (result != 0) { + return result; + } + return binary.compare(s1, s2); + } + } +} +//#endif