2 //#if defined(FOUNDATION10) || defined(J2SE13)
5 *******************************************************************************
6 * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
7 * and others. All Rights Reserved.
8 *******************************************************************************
10 package com.ibm.icu.text;
12 import java.util.ArrayList;
13 import java.util.Collection;
14 import java.util.Collections;
15 import java.util.Comparator;
16 import java.util.Iterator;
17 import java.util.LinkedHashMap;
18 import java.util.LinkedHashSet;
19 import java.util.List;
22 import java.util.TreeSet;
24 import com.ibm.icu.lang.UCharacter;
25 import com.ibm.icu.util.LocaleData;
26 import com.ibm.icu.util.ULocale;
27 import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
30 * A set of characters for use as a UI "index", that is, a
31 * list of clickable characters (or character sequences) that allow the user to
32 * see a segment of a larger "target" list. That is, each character corresponds
33 * to a bucket in the target list, where everything in the bucket is greater
34 * than or equal to the character (according to the locale's collation). The
35 * intention is to have two main functions; one that produces an index list that
36 * is relatively static, and the other is a list that produces roughly
37 * equally-sized buckets. Only the first is currently provided.
39 * The static list would be presented as something like
42 * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
45 * In the UI, an index character could be omitted if its bucket is empty. For
46 * example, if there is nothing in the bucket for Q, then Q could be omitted.
48 * <b>Important Notes:</b>
50 * <li>Although we say "character" above, the index character could be a
51 * sequence, like "CH".</li>
52 * <li>There could be items in a target list that are less than the first or
53 * (much) greater than the last; examples include words from other scripts. The
54 * UI could bucket them with the first or last respectively, or have some symbol
55 * for those categories.</li>
56 * <li>The use of the list requires that the target list be sorted according to
57 * the locale that is used to create that list.</li>
58 * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
59 * the results may appear arbitrary, and it may be best not to use these methods.</li>
60 * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
65 * @provisional This API might change or be removed in a future release.
67 //TODO(markdavis) return an additional character that is the "least greater" character than
69 public class IndexCharacters {
70 public static final char CGJ = '\u034F';
71 private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
72 private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
73 private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
74 private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
76 private ULocale locale;
77 private Collator comparator;
78 private Set indexCharacters;
79 private LinkedHashMap alreadyIn = new LinkedHashMap();
80 private List noDistinctSorting = new ArrayList();
81 private List notAlphabetic = new ArrayList();
84 * Create the index object.
87 * @provisional This API might change or be removed in a future release.
89 public IndexCharacters(ULocale locale) {
91 comparator = Collator.getInstance(locale);
92 comparator.setStrength(Collator.PRIMARY);
94 // get the exemplars, and handle special cases
96 UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
97 // question: should we add auxiliary exemplars?
98 if (exemplars.containsSome(CORE_LATIN)) {
99 exemplars.addAll(CORE_LATIN);
101 if (exemplars.containsSome(HANGUL)) {
102 // cut down to small list
103 exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
105 if (exemplars.containsSome(ETHIOPIC)) {
106 // cut down to small list
107 // make use of the fact that Ethiopic is allocated in 8's, where
108 // the base is 0 mod 8.
109 for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
110 if ((it.codepoint & 0x7) != 0) {
111 exemplars.remove(it.codepoint);
116 // first sort them, with an "best" ordering among items that are the same according
119 Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
120 comparator, new PreferenceComparator(Collator.getInstance(locale))}));
121 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
122 preferenceSorting.add(it.getString());
125 indexCharacters = new TreeSet(comparator);
127 // We nw make a sorted array of elements, uppercased
128 // Some of the input may, however, be redundant.
129 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
130 // So we make a pass through, filtering out those cases.
132 for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
133 String item = (String) it.next();
134 item = UCharacter.toUpperCase(locale, item);
135 if (indexCharacters.contains(item)) {
136 for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
137 Object itemAlreadyIn = it2.next();
138 if (comparator.compare(item, itemAlreadyIn) == 0) {
139 Set targets = (Set) alreadyIn.get(itemAlreadyIn);
140 if (targets == null) {
141 alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
147 } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
148 noDistinctSorting.add(item);
149 } else if (!ALPHABETIC.containsSome(item)) {
150 notAlphabetic.add(item);
152 indexCharacters.add(item);
156 // if the result is still too large, cut down to 100 elements
158 final int size = indexCharacters.size() - 1;
162 for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
165 final int bump = count * 99 / size;
173 indexCharacters = Collections.unmodifiableSet(indexCharacters);
177 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
179 private String separated(String item) {
180 StringBuffer result = new StringBuffer();
181 // add a CGJ except within surrogates
182 char last = item.charAt(0);
184 for (int i = 1; i < item.length(); ++i) {
185 char ch = item.charAt(i);
186 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
192 return result.toString();
196 * Get the index characters.
197 * @return A collection including the index characters
199 * @provisional This API might change or be removed in a future release.
201 public Collection getIndexCharacters() {
202 return indexCharacters;
207 * @return The locale.
209 * @provisional This API might change or be removed in a future release.
211 public ULocale getLocale() {
216 * As the index is built, items may be discarded from the exemplars.
217 * This contains some of the discards, and is intended for debugging.
220 public Map getAlreadyIn() {
225 * As the index is built, items may be discarded from the exemplars.
226 * This contains some of the discards, and is intended for debugging.
229 public List getNoDistinctSorting() {
230 return noDistinctSorting;
234 * As the index is built, items may be discarded from the exemplars.
235 * This contains some of the discards, and is intended for debugging.
238 public List getNotAlphabetic() {
239 return notAlphabetic;
243 * Comparator that returns "better" items first, where shorter NFKD is better,
244 * and otherwise NFKD binary order is better, and otherwise binary order is better.
246 private static class PreferenceComparator implements Comparator {
247 static final Comparator binary = new UTF16.StringComparator(true,false,0);
248 final Collator collator;
250 public PreferenceComparator(Collator collator) {
251 this.collator = collator;
254 public int compare(Object o1, Object o2) {
258 String s1 = (String) o1;
259 String s2 = (String) o2;
260 String n1 = Normalizer.decompose(s1, true);
261 String n2 = Normalizer.decompose(s2, true);
262 int result = n1.length() - n2.length();
266 result = collator.compare(n1, n2);
270 return binary.compare(s1, s2);