2 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
5 *******************************************************************************
\r
6 * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
\r
7 * and others. All Rights Reserved.
\r
8 *******************************************************************************
\r
10 package com.ibm.icu.text;
\r
12 import java.util.ArrayList;
\r
13 import java.util.Collection;
\r
14 import java.util.Collections;
\r
15 import java.util.Comparator;
\r
16 import java.util.Iterator;
\r
17 import java.util.LinkedHashMap;
\r
18 import java.util.LinkedHashSet;
\r
19 import java.util.List;
\r
20 import java.util.Map;
\r
21 import java.util.Set;
\r
22 import java.util.TreeSet;
\r
24 import com.ibm.icu.lang.UCharacter;
\r
25 import com.ibm.icu.util.LocaleData;
\r
26 import com.ibm.icu.util.ULocale;
\r
27 import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
\r
30 * A set of characters for use as a UI "index", that is, a
\r
31 * list of clickable characters (or character sequences) that allow the user to
\r
32 * see a segment of a larger "target" list. That is, each character corresponds
\r
33 * to a bucket in the target list, where everything in the bucket is greater
\r
34 * than or equal to the character (according to the locale's collation). The
\r
35 * intention is to have two main functions; one that produces an index list that
\r
36 * is relatively static, and the other is a list that produces roughly
\r
37 * equally-sized buckets. Only the first is currently provided.
\r
39 * The static list would be presented as something like
\r
42 * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
\r
45 * In the UI, an index character could be omitted if its bucket is empty. For
\r
46 * example, if there is nothing in the bucket for Q, then Q could be omitted.
\r
48 * <b>Important Notes:</b>
\r
50 * <li>Although we say "character" above, the index character could be a
\r
51 * sequence, like "CH".</li>
\r
52 * <li>There could be items in a target list that are less than the first or
\r
53 * (much) greater than the last; examples include words from other scripts. The
\r
54 * UI could bucket them with the first or last respectively, or have some symbol
\r
55 * for those categories.</li>
\r
56 * <li>The use of the list requires that the target list be sorted according to
\r
57 * the locale that is used to create that list.</li>
\r
58 * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
\r
59 * the results may appear arbitrary, and it may be best not to use these methods.</li>
\r
60 * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
\r
65 * @provisional This API might change or be removed in a future release.
\r
67 //TODO(markdavis) return an additional character that is the "least greater" character than
\r
68 //the last character.
\r
69 public class IndexCharacters {
\r
70 public static final char CGJ = '\u034F';
\r
71 private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
\r
72 private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
\r
73 private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
\r
74 private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
\r
76 private ULocale locale;
\r
77 private Collator comparator;
\r
78 private Set indexCharacters;
\r
79 private LinkedHashMap alreadyIn = new LinkedHashMap();
\r
80 private List noDistinctSorting = new ArrayList();
\r
81 private List notAlphabetic = new ArrayList();
\r
84 * Create the index object.
\r
87 * @provisional This API might change or be removed in a future release.
\r
89 public IndexCharacters(ULocale locale) {
\r
90 this.locale = locale;
\r
91 comparator = Collator.getInstance(locale);
\r
92 comparator.setStrength(Collator.PRIMARY);
\r
94 // get the exemplars, and handle special cases
\r
96 UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
\r
97 // question: should we add auxiliary exemplars?
\r
98 if (exemplars.containsSome(CORE_LATIN)) {
\r
99 exemplars.addAll(CORE_LATIN);
\r
101 if (exemplars.containsSome(HANGUL)) {
\r
102 // cut down to small list
\r
103 exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
\r
105 if (exemplars.containsSome(ETHIOPIC)) {
\r
106 // cut down to small list
\r
107 // make use of the fact that Ethiopic is allocated in 8's, where
\r
108 // the base is 0 mod 8.
\r
109 for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
\r
110 if ((it.codepoint & 0x7) != 0) {
\r
111 exemplars.remove(it.codepoint);
\r
116 // first sort them, with an "best" ordering among items that are the same according
\r
119 Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
\r
120 comparator, new PreferenceComparator(Collator.getInstance(locale))}));
\r
121 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
\r
122 preferenceSorting.add(it.getString());
\r
125 indexCharacters = new TreeSet(comparator);
\r
127 // We nw make a sorted array of elements, uppercased
\r
128 // Some of the input may, however, be redundant.
\r
129 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
\r
130 // So we make a pass through, filtering out those cases.
\r
132 for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
\r
133 String item = (String) it.next();
\r
134 item = UCharacter.toUpperCase(locale, item);
\r
135 if (indexCharacters.contains(item)) {
\r
136 for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
\r
137 Object itemAlreadyIn = it2.next();
\r
138 if (comparator.compare(item, itemAlreadyIn) == 0) {
\r
139 Set targets = (Set) alreadyIn.get(itemAlreadyIn);
\r
140 if (targets == null) {
\r
141 alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
\r
147 } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
\r
148 noDistinctSorting.add(item);
\r
149 } else if (!ALPHABETIC.containsSome(item)) {
\r
150 notAlphabetic.add(item);
\r
152 indexCharacters.add(item);
\r
156 // if the result is still too large, cut down to 100 elements
\r
158 final int size = indexCharacters.size() - 1;
\r
162 for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
\r
165 final int bump = count * 99 / size;
\r
173 indexCharacters = Collections.unmodifiableSet(indexCharacters);
\r
177 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
\r
179 private String separated(String item) {
\r
180 StringBuffer result = new StringBuffer();
\r
181 // add a CGJ except within surrogates
\r
182 char last = item.charAt(0);
\r
183 result.append(last);
\r
184 for (int i = 1; i < item.length(); ++i) {
\r
185 char ch = item.charAt(i);
\r
186 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
\r
187 result.append(CGJ);
\r
192 return result.toString();
\r
196 * Get the index characters.
\r
197 * @return A collection including the index characters
\r
199 * @provisional This API might change or be removed in a future release.
\r
201 public Collection getIndexCharacters() {
\r
202 return indexCharacters;
\r
207 * @return The locale.
\r
209 * @provisional This API might change or be removed in a future release.
\r
211 public ULocale getLocale() {
\r
216 * As the index is built, items may be discarded from the exemplars.
\r
217 * This contains some of the discards, and is intended for debugging.
\r
220 public Map getAlreadyIn() {
\r
225 * As the index is built, items may be discarded from the exemplars.
\r
226 * This contains some of the discards, and is intended for debugging.
\r
229 public List getNoDistinctSorting() {
\r
230 return noDistinctSorting;
\r
234 * As the index is built, items may be discarded from the exemplars.
\r
235 * This contains some of the discards, and is intended for debugging.
\r
238 public List getNotAlphabetic() {
\r
239 return notAlphabetic;
\r
243 * Comparator that returns "better" items first, where shorter NFKD is better,
\r
244 * and otherwise NFKD binary order is better, and otherwise binary order is better.
\r
246 private static class PreferenceComparator implements Comparator {
\r
247 static final Comparator binary = new UTF16.StringComparator(true,false,0);
\r
248 final Collator collator;
\r
250 public PreferenceComparator(Collator collator) {
\r
251 this.collator = collator;
\r
254 public int compare(Object o1, Object o2) {
\r
258 String s1 = (String) o1;
\r
259 String s2 = (String) o2;
\r
260 String n1 = Normalizer.decompose(s1, true);
\r
261 String n2 = Normalizer.decompose(s2, true);
\r
262 int result = n1.length() - n2.length();
\r
266 result = collator.compare(n1, n2);
\r
270 return binary.compare(s1, s2);
\r