2 *******************************************************************************
\r
3 * Copyright (C) 2008-2010, Google Inc, International Business Machines Corporation
\r
4 * and others. All Rights Reserved.
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.util.ArrayList;
\r
10 import java.util.Collection;
\r
11 import java.util.Collections;
\r
12 import java.util.Comparator;
\r
13 import java.util.Iterator;
\r
14 import java.util.LinkedHashMap;
\r
15 import java.util.LinkedHashSet;
\r
16 import java.util.List;
\r
17 import java.util.Map;
\r
18 import java.util.Set;
\r
19 import java.util.TreeSet;
\r
21 import com.ibm.icu.impl.MultiComparator;
\r
22 import com.ibm.icu.lang.UCharacter;
\r
23 import com.ibm.icu.util.LocaleData;
\r
24 import com.ibm.icu.util.ULocale;
\r
27 * A set of characters for use as a UI "index", that is, a
\r
28 * list of clickable characters (or character sequences) that allow the user to
\r
29 * see a segment of a larger "target" list. That is, each character corresponds
\r
30 * to a bucket in the target list, where everything in the bucket is greater
\r
31 * than or equal to the character (according to the locale's collation). The
\r
32 * intention is to have two main functions; one that produces an index list that
\r
33 * is relatively static, and the other is a list that produces roughly
\r
34 * equally-sized buckets. Only the first is currently provided.
\r
36 * The static list would be presented as something like
\r
39 * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
\r
42 * In the UI, an index character could be omitted if its bucket is empty. For
\r
43 * example, if there is nothing in the bucket for Q, then Q could be omitted.
\r
45 * <b>Important Notes:</b>
\r
47 * <li>Although we say "character" above, the index character could be a
\r
48 * sequence, like "CH".</li>
\r
49 * <li>There could be items in a target list that are less than the first or
\r
50 * (much) greater than the last; examples include words from other scripts. The
\r
51 * UI could bucket them with the first or last respectively, or have some symbol
\r
52 * for those categories.</li>
\r
53 * <li>The use of the list requires that the target list be sorted according to
\r
54 * the locale that is used to create that list.</li>
\r
55 * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
\r
56 * the results may appear arbitrary, and it may be best not to use these methods.</li>
\r
57 * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
\r
62 * @provisional This API might change or be removed in a future release.
\r
64 //TODO(markdavis) return an additional character that is the "least greater" character than
\r
65 //the last character.
\r
66 public class IndexCharacters {
\r
67 private static final char CGJ = '\u034F';
\r
68 private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
\r
69 private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
\r
70 private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
\r
71 private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
\r
73 private ULocale locale;
\r
74 private Collator comparator;
\r
75 private Set<String> indexCharacters;
\r
76 private LinkedHashMap<String, Set<String>> alreadyIn = new LinkedHashMap<String, Set<String>>();
\r
77 private List<String> noDistinctSorting = new ArrayList<String>();
\r
78 private List<String> notAlphabetic = new ArrayList<String>();
\r
81 * Create the index object.
\r
82 * @param locale The locale to be passed.
\r
84 * @provisional This API might change or be removed in a future release.
\r
86 public IndexCharacters(ULocale locale) {
\r
87 this(locale, LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD), Collator.getInstance(locale));
\r
92 * @deprecated This API is ICU internal only.
\r
94 @SuppressWarnings("unchecked")
\r
95 public IndexCharacters(ULocale locale, UnicodeSet exemplarSet, Collator collator) {
\r
96 this.locale = locale;
\r
98 comparator = (Collator) collator.clone();
\r
99 } catch (CloneNotSupportedException e) {
\r
100 throw new IllegalArgumentException(e);
\r
102 comparator.setStrength(Collator.PRIMARY);
\r
104 // get the exemplars, and handle special cases
\r
106 UnicodeSet exemplars = exemplarSet.cloneAsThawed();
\r
107 // question: should we add auxiliary exemplars?
\r
108 if (exemplars.containsSome(CORE_LATIN)) {
\r
109 exemplars.addAll(CORE_LATIN);
\r
111 if (exemplars.containsSome(HANGUL)) {
\r
112 // cut down to small list
\r
113 exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
\r
115 if (exemplars.containsSome(ETHIOPIC)) {
\r
116 // cut down to small list
\r
117 // make use of the fact that Ethiopic is allocated in 8's, where
\r
118 // the base is 0 mod 8.
\r
119 for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
\r
120 if ((it.codepoint & 0x7) != 0) {
\r
121 exemplars.remove(it.codepoint);
\r
126 // first sort them, with an "best" ordering among items that are the same according
\r
128 Comparator<Object>[] comparators = (Comparator<Object>[])new Comparator[2];
\r
129 comparators[0] = comparator;
\r
130 comparators[1] = new PreferenceComparator(Collator.getInstance(locale));
\r
132 Set<String> preferenceSorting = new TreeSet<String>(new MultiComparator<Object>(comparators));
\r
133 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
\r
134 preferenceSorting.add(it.getString());
\r
137 indexCharacters = new TreeSet<String>(comparator);
\r
139 // We nw make a sorted array of elements, uppercased
\r
140 // Some of the input may, however, be redundant.
\r
141 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
\r
142 // So we make a pass through, filtering out those cases.
\r
144 for (String item : preferenceSorting) {
\r
145 item = UCharacter.toUpperCase(locale, item);
\r
146 if (indexCharacters.contains(item)) {
\r
147 for (String itemAlreadyIn : indexCharacters) {
\r
148 if (comparator.compare(item, itemAlreadyIn) == 0) {
\r
149 Set<String> targets = alreadyIn.get(itemAlreadyIn);
\r
150 if (targets == null) {
\r
151 alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
\r
157 } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
\r
158 noDistinctSorting.add(item);
\r
159 } else if (!ALPHABETIC.containsSome(item)) {
\r
160 notAlphabetic.add(item);
\r
162 indexCharacters.add(item);
\r
166 // if the result is still too large, cut down to 100 elements
\r
168 final int size = indexCharacters.size() - 1;
\r
172 for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
\r
175 final int bump = count * 99 / size;
\r
183 indexCharacters = Collections.unmodifiableSet(indexCharacters);
\r
187 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
\r
189 private String separated(String item) {
\r
190 StringBuilder result = new StringBuilder();
\r
191 // add a CGJ except within surrogates
\r
192 char last = item.charAt(0);
\r
193 result.append(last);
\r
194 for (int i = 1; i < item.length(); ++i) {
\r
195 char ch = item.charAt(i);
\r
196 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
\r
197 result.append(CGJ);
\r
202 return result.toString();
\r
206 * Get the index characters.
\r
207 * @return A collection including the index characters
\r
209 * @provisional This API might change or be removed in a future release.
\r
211 public Collection<String> getIndexCharacters() {
\r
212 return indexCharacters;
\r
217 * @return The locale.
\r
219 * @provisional This API might change or be removed in a future release.
\r
221 public ULocale getLocale() {
\r
226 * As the index is built, items may be discarded from the exemplars.
\r
227 * This contains some of the discards, and is intended for debugging.
\r
229 * @deprecated This API is ICU internal only.
\r
231 public Map<String, Set<String>> getAlreadyIn() {
\r
236 * As the index is built, items may be discarded from the exemplars.
\r
237 * This contains some of the discards, and is intended for debugging.
\r
239 * @deprecated This API is ICU internal only.
\r
241 public List<String> getNoDistinctSorting() {
\r
242 return noDistinctSorting;
\r
246 * As the index is built, items may be discarded from the exemplars.
\r
247 * This contains some of the discards, and is intended for debugging.
\r
249 * @deprecated This API is ICU internal only.
\r
251 public List<String> getNotAlphabetic() {
\r
252 return notAlphabetic;
\r
256 * Comparator that returns "better" items first, where shorter NFKD is better,
\r
257 * and otherwise NFKD binary order is better, and otherwise binary order is better.
\r
259 private static class PreferenceComparator implements Comparator<Object> {
\r
260 static final Comparator<String> binary = new UTF16.StringComparator(true,false,0);
\r
261 final Collator collator;
\r
263 public PreferenceComparator(Collator collator) {
\r
264 this.collator = collator;
\r
267 public int compare(Object o1, Object o2) {
\r
268 return compare((String)o1, (String)o2);
\r
271 public int compare(String s1, String s2) {
\r
275 String n1 = Normalizer.decompose(s1, true);
\r
276 String n2 = Normalizer.decompose(s2, true);
\r
277 int result = n1.length() - n2.length();
\r
281 result = collator.compare(n1, n2);
\r
285 return binary.compare(s1, s2);
\r