//##header //#if defined(FOUNDATION10) || defined(J2SE13) //#else /* ******************************************************************************* * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation * and others. All Rights Reserved. ******************************************************************************* */ package com.ibm.icu.text; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.util.LocaleData; import com.ibm.icu.util.ULocale; import com.ibm.icu.impl.CollectionUtilities.MultiComparator; /** * A set of characters for use as a UI "index", that is, a * list of clickable characters (or character sequences) that allow the user to * see a segment of a larger "target" list. That is, each character corresponds * to a bucket in the target list, where everything in the bucket is greater * than or equal to the character (according to the locale's collation). The * intention is to have two main functions; one that produces an index list that * is relatively static, and the other is a list that produces roughly * equally-sized buckets. Only the first is currently provided. *

* The static list would be presented as something like * *

 *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 * 
* * In the UI, an index character could be omitted if its bucket is empty. For * example, if there is nothing in the bucket for Q, then Q could be omitted. *

* Important Notes: *

* * @author markdavis * @draft ICU 4.2 * @provisional This API might change or be removed in a future release. */ //TODO(markdavis) return an additional character that is the "least greater" character than //the last character. public class IndexCharacters { public static final char CGJ = '\u034F'; private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]"); private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]"); private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]"); private ULocale locale; private Collator comparator; private Set indexCharacters; private LinkedHashMap alreadyIn = new LinkedHashMap(); private List noDistinctSorting = new ArrayList(); private List notAlphabetic = new ArrayList(); /** * Create the index object. * @param locale * @draft ICU 4.2 * @provisional This API might change or be removed in a future release. */ public IndexCharacters(ULocale locale) { this.locale = locale; comparator = Collator.getInstance(locale); comparator.setStrength(Collator.PRIMARY); // get the exemplars, and handle special cases UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD); // question: should we add auxiliary exemplars? if (exemplars.containsSome(CORE_LATIN)) { exemplars.addAll(CORE_LATIN); } if (exemplars.containsSome(HANGUL)) { // cut down to small list exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL); } if (exemplars.containsSome(ETHIOPIC)) { // cut down to small list // make use of the fact that Ethiopic is allocated in 8's, where // the base is 0 mod 8. for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) { if ((it.codepoint & 0x7) != 0) { exemplars.remove(it.codepoint); } } } // first sort them, with an "best" ordering among items that are the same according // to the collator Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{ comparator, new PreferenceComparator(Collator.getInstance(locale))})); for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) { preferenceSorting.add(it.getString()); } indexCharacters = new TreeSet(comparator); // We nw make a sorted array of elements, uppercased // Some of the input may, however, be redundant. // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" // So we make a pass through, filtering out those cases. for (Iterator it = preferenceSorting.iterator(); it.hasNext();) { String item = (String) it.next(); item = UCharacter.toUpperCase(locale, item); if (indexCharacters.contains(item)) { for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) { Object itemAlreadyIn = it2.next(); if (comparator.compare(item, itemAlreadyIn) == 0) { Set targets = (Set) alreadyIn.get(itemAlreadyIn); if (targets == null) { alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet()); } targets.add(item); break; } } } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){ noDistinctSorting.add(item); } else if (!ALPHABETIC.containsSome(item)) { notAlphabetic.add(item); } else { indexCharacters.add(item); } } // if the result is still too large, cut down to 100 elements final int size = indexCharacters.size() - 1; if (size > 99) { int count = 0; int old = -1; for (Iterator it = indexCharacters.iterator(); it.hasNext();) { ++ count; it.next(); final int bump = count * 99 / size; if (bump == old) { it.remove(); } else { old = bump; } } } indexCharacters = Collections.unmodifiableSet(indexCharacters); } /* * Return the string with interspersed CGJs. Input must have more than 2 codepoints. */ private String separated(String item) { StringBuffer result = new StringBuffer(); // add a CGJ except within surrogates char last = item.charAt(0); result.append(last); for (int i = 1; i < item.length(); ++i) { char ch = item.charAt(i); if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) { result.append(CGJ); } result.append(ch); last = ch; } return result.toString(); } /** * Get the index characters. * @return A collection including the index characters * @draft ICU 4.2 * @provisional This API might change or be removed in a future release. */ public Collection getIndexCharacters() { return indexCharacters; } /** * Get the locale * @return The locale. * @draft ICU 4.2 * @provisional This API might change or be removed in a future release. */ public ULocale getLocale() { return locale; } /** * As the index is built, items may be discarded from the exemplars. * This contains some of the discards, and is intended for debugging. * @internal */ public Map getAlreadyIn() { return alreadyIn; } /** * As the index is built, items may be discarded from the exemplars. * This contains some of the discards, and is intended for debugging. * @internal */ public List getNoDistinctSorting() { return noDistinctSorting; } /** * As the index is built, items may be discarded from the exemplars. * This contains some of the discards, and is intended for debugging. * @internal */ public List getNotAlphabetic() { return notAlphabetic; } /* * Comparator that returns "better" items first, where shorter NFKD is better, * and otherwise NFKD binary order is better, and otherwise binary order is better. */ private static class PreferenceComparator implements Comparator { static final Comparator binary = new UTF16.StringComparator(true,false,0); final Collator collator; public PreferenceComparator(Collator collator) { this.collator = collator; } public int compare(Object o1, Object o2) { if (o1 == o2) { return 0; } String s1 = (String) o1; String s2 = (String) o2; String n1 = Normalizer.decompose(s1, true); String n2 = Normalizer.decompose(s2, true); int result = n1.length() - n2.length(); if (result != 0) { return result; } result = collator.compare(n1, n2); if (result != 0) { return result; } return binary.compare(s1, s2); } } } //#endif