2 *******************************************************************************
3 * Copyright (C) 2008-2011, Google Inc, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.Collection;
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.HashMap;
15 import java.util.Iterator;
16 import java.util.LinkedHashMap;
17 import java.util.LinkedHashSet;
18 import java.util.List;
19 import java.util.Locale;
22 import java.util.TreeSet;
24 import com.ibm.icu.impl.MultiComparator;
25 import com.ibm.icu.lang.UCharacter;
26 import com.ibm.icu.lang.UProperty;
27 import com.ibm.icu.lang.UScript;
28 import com.ibm.icu.text.AlphabeticIndex.Bucket;
29 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
30 import com.ibm.icu.util.LocaleData;
31 import com.ibm.icu.util.ULocale;
34 * AlphabeticIndex supports the creation of a UI index appropriate for a given language. It can support either direct
35 * use, or use with a client that doesn't support localized collation. The following is an example of what an index
36 * might look like in a UI:
39 * <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ...</b>
50 * The class can generate a list of labels for use as a UI "index", that is, a list of clickable characters (or
51 * character sequences) that allow the user to see a segment (bucket) of a larger "target" list. That is, each label
52 * corresponds to a bucket in the target list, where everything in the bucket is greater than or equal to the character
53 * (according to the locale's collation). Strings can be added to the index; they will be in sorted order in the right
56 * The class also supports having buckets for strings before the first (underflow), after the last (overflow), and
57 * between scripts (inflow). For example, if the index is constructed with labels for Russian and English, Greek
58 * characters would fall into an inflow bucket between the other two scripts.</p>
60 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters as well as characters from the user's language, then it is a good idea to call addLabels(ULocale.English).</p>
63 * <p>The following shows an example of building an index directly.
64 * The "show..." methods below are just to illustrate usage.
67 * // Create a simple index where the values for the strings are Integers, and add the strings
69 * AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale);
71 * for (String item : test) {
72 * index.addRecord(item, counter++);
75 * // Show index at top. We could skip or gray out empty buckets
77 * for (AlphabeticIndex.Bucket<Integer> bucket : index) {
78 * if (showAll || bucket.size() != 0) {
79 * showLabelAtTop(UI, bucket.getLabel());
83 * // Show the buckets with their contents, skipping empty buckets
85 * for (AlphabeticIndex.Bucket<Integer> bucket : index) {
86 * if (bucket.size() != 0) {
87 * showLabelInList(UI, bucket.getLabel());
88 * for (AlphabeticIndex.Record<Integer> item : bucket) {
89 * showIndexedItem(UI, item.getName(), item.getData());
93 * The caller can build different UIs using this class. For example, an index character could be omitted or grayed-out
94 * if its bucket is empty. Small buckets could also be combined based on size, such as:
97 * <b>... A-F G-N O-Z ...</b>
100 * <h2>Client Support</h2>
102 * Callers can also use the AlphabeticIndex to support sorting on a client that doesn't support collation.
104 * <li>getLabels() can be used to get a list of the labels, such as "...", "A", "B",..., and send that list to the client.
106 * <li>When the client has a new name, it sends that name to the server. The server needs to call the following methods,
107 * and communicate the bucketIndex and collationKey back to the client.
110 * int bucketIndex = alphabeticIndex.getBucketIndex(name);
111 * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
114 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
115 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
121 * <li>Additional collation parameters can be passed in as part of the locale name. For example, German plus numeric
122 * sorting would be "de@kn-true".
126 * @provisional This API might change or be removed in a future release.
128 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
133 static final boolean HACK_CODED_FIRSTS = true;
135 private static UnicodeSet UNIHAN = new UnicodeSet("[:script=Hani:]").freeze();
137 static final String BASE = "\uFDD0";
138 // these are generated. Later, get from CLDR data.
140 static final UnicodeSet PINYIN_LABELS = new UnicodeSet("[A-Z{\uFDD0A}{\uFDD0B}{\uFDD0C}{\uFDD0D}{\uFDD0E}{\uFDD0F}{\uFDD0G}{\uFDD0H}{\uFDD0I}{\uFDD0J}{\uFDD0K}{\uFDD0L}{\uFDD0M}{\uFDD0N}{\uFDD0O}{\uFDD0P}{\uFDD0Q}{\uFDD0R}{\uFDD0S}{\uFDD0T}{\uFDD0U}{\uFDD0V}{\uFDD0W}{\uFDD0X}{\uFDD0Y}{\uFDD0Z}]").freeze();
141 static final UnicodeSet STROKE_LABELS = new UnicodeSet("[{\uFDD0\u2801}{\uFDD0\u2802}{\uFDD0\u2803}{\uFDD0\u2804}{\uFDD0\u2805}{\uFDD0\u2806}{\uFDD0\u2807}{\uFDD0\u2808}{\uFDD0\u2809}{\uFDD0\u280A}{\uFDD0\u280B}{\uFDD0\u280C}{\uFDD0\u280D}{\uFDD0\u280E}{\uFDD0\u280F}{\uFDD0\u2810}{\uFDD0\u2811}{\uFDD0\u2812}{\uFDD0\u2813}{\uFDD0\u2814}{\uFDD0\u2815}{\uFDD0\u2816}{\uFDD0\u2817}{\uFDD0\u2818}{\uFDD0\u2819}{\uFDD0\u281A}{\uFDD0\u281B}{\uFDD0\u281C}{\uFDD0\u281D}{\uFDD0\u281E}{\uFDD0\u281F}{\uFDD0\u2820}{\uFDD0\u2821}{\uFDD0\u2822}{\uFDD0\u2823}{\uFDD0\u2824}{\uFDD0\u2825}{\uFDD0\u2826}{\uFDD0\u2827}{\uFDD0\u2828}{\uFDD0\u2829}{\uFDD0\u282A}{\uFDD0\u282B}{\uFDD0\u282C}{\uFDD0\u282E}{\uFDD0\u2830}{\uFDD0\u2834}{\uFDD0\u2840}]").freeze();
142 static final UnicodeSet RADICAL_LABELS = new UnicodeSet("[{\uFDD0\u2E80}{\uFDD0\u2E81}{\uFDD0\u2E84}{\uFDD0\u2E85}{\uFDD0\u2E86}{\uFDD0\u2E87}{\uFDD0\u2E88}{\uFDD0\u2E8A}{\uFDD0\u2E8B}{\uFDD0\u2E8C}{\uFDD0\u2E91}{\uFDD0\u2E92}{\uFDD0\u2E93}{\uFDD0\u2E95}{\uFDD0\u2E97}{\uFDD0\u2E98}{\uFDD0\u2E99}{\uFDD0\u2E9B}{\uFDD0\u2E9D}{\uFDD0\u2E9E}{\uFDD0\u2E9F}{\uFDD0\u2EA0}{\uFDD0\u2EA2}{\uFDD0\u2EA3}{\uFDD0\u2EA4}{\uFDD0\u2EA7}{\uFDD0\u2EA8}{\uFDD0\u2EA9}{\uFDD0\u2EAA}{\uFDD0\u2EAB}{\uFDD0\u2EAC}{\uFDD0\u2EAE}{\uFDD0\u2EAF}{\uFDD0\u2EB0}{\uFDD0\u2EB4}{\uFDD0\u2EB8}{\uFDD0\u2EB9}{\uFDD0\u2EBB}{\uFDD0\u2EBC}{\uFDD0\u2EBD}{\uFDD0\u2EC0}{\uFDD0\u2EC1}{\uFDD0\u2EC2}{\uFDD0\u2EC3}{\uFDD0\u2EC5}{\uFDD0\u2EC6}{\uFDD0\u2EC8}{\uFDD0\u2EC9}{\uFDD0\u2ECA}{\uFDD0\u2ECB}{\uFDD0\u2ECF}{\uFDD0\u2ED0}{\uFDD0\u2ED1}{\uFDD0\u2ED3}{\uFDD0\u2ED4}{\uFDD0\u2ED6}{\uFDD0\u2ED7}{\uFDD0\u2ED8}{\uFDD0\u2ED9}{\uFDD0\u2EDA}{\uFDD0\u2EDB}{\uFDD0\u2EDC}{\uFDD0\u2EDD}{\uFDD0\u2EE0}{\uFDD0\u2EE1}{\uFDD0\u2EE2}{\uFDD0\u2EE3}{\uFDD0\u2EE4}{\uFDD0\u2EE5}{\uFDD0\u2EE6}{\uFDD0\u2EE7}{\uFDD0\u2EE8}{\uFDD0\u2EEA}{\uFDD0\u2EEB}{\uFDD0\u2EED}{\uFDD0\u2EEE}{\uFDD0\u2EEF}{\uFDD0\u2EF0}{\uFDD0\u2EF2}{\uFDD0\u2EF3}{\uFDD0\u2F00}{\uFDD0\u2F01}{\uFDD0\u2F02}{\uFDD0\u2F03}{\uFDD0\u2F05}{\uFDD0\u2F06}{\uFDD0\u2F07}{\uFDD0\u2F09}{\uFDD0\u2F0A}{\uFDD0\u2F0B}{\uFDD0\u2F0D}{\uFDD0\u2F0E}{\uFDD0\u2F10}{\uFDD0\u2F12}{\uFDD0\u2F13}{\uFDD0\u2F14}{\uFDD0\u2F15}{\uFDD0\u2F16}{\uFDD0\u2F17}{\uFDD0\u2F1B}{\uFDD0\u2F1D}{\uFDD0\u2F1E}{\uFDD0\u2F1F}{\uFDD0\u2F20}{\uFDD0\u2F21}{\uFDD0\u2F22}{\uFDD0\u2F23}{\uFDD0\u2F24}{\uFDD0\u2F25}{\uFDD0\u2F26}{\uFDD0\u2F27}{\uFDD0\u2F28}{\uFDD0\u2F2B}{\uFDD0\u2F2C}{\uFDD0\u2F2D}{\uFDD0\u2F2E}{\uFDD0\u2F2F}{\uFDD0\u2F31}{\uFDD0\u2F32}{\uFDD0\u2F34}{\uFDD0\u2F35}{\uFDD0\u2F36}{\uFDD0\u2F37}{\uFDD0\u2F38}{\uFDD0\u2F3A}{\uFDD0\u2F3B}{\uFDD0\u2F3D}{\uFDD0\u2F3E}{\uFDD0\u2F40}{\uFDD0\u2F42}{\uFDD0\u2F43}{\uFDD0\u2F44}{\uFDD0\u2F45}{\uFDD0\u2F46}{\uFDD0\u2F48}{\uFDD0\u2F4A}{\uFDD0\u2F4B}{\uFDD0\u2F4C}{\uFDD0\u2F4E}{\uFDD0\u2F50}{\uFDD0\u2F51}{\uFDD0\u2F53}{\uFDD0\u2F57}{\uFDD0\u2F58}{\uFDD0\u2F59}{\uFDD0\u2F5A}{\uFDD0\u2F5B}{\uFDD0\u2F5E}{\uFDD0\u2F60}{\uFDD0\u2F61}{\uFDD0\u2F62}{\uFDD0\u2F63}{\uFDD0\u2F64}{\uFDD0\u2F65}{\uFDD0\u2F67}{\uFDD0\u2F68}{\uFDD0\u2F69}{\uFDD0\u2F6A}{\uFDD0\u2F6B}{\uFDD0\u2F6D}{\uFDD0\u2F6E}{\uFDD0\u2F6F}{\uFDD0\u2F71}{\uFDD0\u2F72}{\uFDD0\u2F73}{\uFDD0\u2F74}{\uFDD0\u2F76}{\uFDD0\u2F78}{\uFDD0\u2F7B}{\uFDD0\u2F7D}{\uFDD0\u2F7E}{\uFDD0\u2F7F}{\uFDD0\u2F82}{\uFDD0\u2F83}{\uFDD0\u2F84}{\uFDD0\u2F86}{\uFDD0\u2F87}{\uFDD0\u2F88}{\uFDD0\u2F89}{\uFDD0\u2F8A}{\uFDD0\u2F8D}{\uFDD0\u2F8E}{\uFDD0\u2F8F}{\uFDD0\u2F92}{\uFDD0\u2F94}{\uFDD0\u2F95}{\uFDD0\u2F96}{\uFDD0\u2F97}{\uFDD0\u2F98}{\uFDD0\u2F99}{\uFDD0\u2F9A}{\uFDD0\u2F9B}{\uFDD0\u2F9D}{\uFDD0\u2F9E}{\uFDD0\u2F9F}{\uFDD0\u2FA0}{\uFDD0\u2FA1}{\uFDD0\u2FA3}{\uFDD0\u2FA4}{\uFDD0\u2FA5}{\uFDD0\u2FA6}{\uFDD0\u2FA8}{\uFDD0\u2FAA}{\uFDD0\u2FAB}{\uFDD0\u2FAE}{\uFDD0\u2FAF}{\uFDD0\u2FB0}{\uFDD0\u2FB1}{\uFDD0\u2FB2}{\uFDD0\u2FB3}{\uFDD0\u2FB4}{\uFDD0\u2FB5}{\uFDD0\u2FB6}{\uFDD0\u2FB9}{\uFDD0\u2FBA}{\uFDD0\u2FBC}{\uFDD0\u2FBD}{\uFDD0\u2FBE}{\uFDD0\u2FBF}{\uFDD0\u2FC0}{\uFDD0\u2FC2}{\uFDD0\u2FC3}{\uFDD0\u2FC4}{\uFDD0\u2FC5}{\uFDD0\u2FC6}{\uFDD0\u2FC7}{\uFDD0\u2FC8}{\uFDD0\u2FC9}{\uFDD0\u2FCA}{\uFDD0\u2FCB}{\uFDD0\u2FCC}{\uFDD0\u2FCD}{\uFDD0\u2FCE}{\uFDD0\u2FCF}{\uFDD0\u2FD0}{\uFDD0\u2FD1}{\uFDD0\u2FD5}]").freeze();
143 static final List<String> PROBES = Arrays.asList("\u4E00", "\uFDD0A", "\uFDD0\u2801", "\uFDD0\u2E80");
144 static final int PINYIN_PROBE_INDEX = 1;
145 static final UnicodeSet[] MATCHING = {null, PINYIN_LABELS, STROKE_LABELS, RADICAL_LABELS};
147 private static final char CGJ = '\u034F';
148 private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]").add(BASE).freeze();
149 private static final UnicodeSet HANGUL = new UnicodeSet(
150 "[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]").freeze();
151 private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]").freeze();
152 private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]").freeze();
154 private final RuleBasedCollator collatorOriginal;
155 private final RuleBasedCollator collatorPrimaryOnly;
156 private RuleBasedCollator collatorExternal;
159 private final LinkedHashMap<String, Set<String>> alreadyIn = new LinkedHashMap<String, Set<String>>();
160 private final List<String> noDistinctSorting = new ArrayList<String>();
161 private final List<String> notAlphabetic = new ArrayList<String>();
163 // We accumulate these as we build up the input parameters
165 private final UnicodeSet initialLabels = new UnicodeSet();
166 private final Collection<Record<V>> inputList = new ArrayList<Record<V>>();
168 // Lazy evaluated: null means that we have not built yet.
170 private BucketList buckets;
172 private String overflowLabel = "\u2026";
173 private String underflowLabel = "\u2026";
174 private String inflowLabel = "\u2026";
175 private boolean hasPinyin;
178 * Create the index object.
181 * The locale for the index.
183 * @provisional This API might change or be removed in a future release.
185 public AlphabeticIndex(ULocale locale) {
186 this(locale, null, null);
190 * Create the index object.
193 * The locale for the index.
195 * @provisional This API might change or be removed in a future release.
197 public AlphabeticIndex(Locale locale) {
198 this(ULocale.forLocale(locale));
203 // * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
205 // public enum LangType {
208 // * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
213 // * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
218 // * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
223 // * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
225 // public static LangType fromLocale(ULocale locale) {
226 // String lang = locale.getLanguage();
227 // if (lang.equals("zh")) {
228 // if ("Hant".equals(locale.getScript()) || "TW".equals(locale.getCountry())) {
229 // return TRADITIONAL;
231 // return SIMPLIFIED;
239 * @deprecated This API is ICU internal only, for testing purposes and use with CLDR.
241 public AlphabeticIndex(ULocale locale, RuleBasedCollator collator, UnicodeSet exemplarChars) {
242 // langType = LangType.fromLocale(locale);
243 // // HACK because we have to know the type of the collation for Chinese
244 // if (langType != LangType.NORMAL) {
245 // locale = locale.setKeywordValue("collation", langType == LangType.TRADITIONAL ? "stroke" : "pinyin");
248 collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
250 collatorPrimaryOnly = (RuleBasedCollator) (collatorOriginal.clone());
251 } catch (Exception e) {
252 // should never happen
253 throw new IllegalStateException("Collator cannot be cloned", e);
255 collatorPrimaryOnly.setStrength(Collator.PRIMARY);
256 if (exemplarChars == null) {
257 exemplarChars = getIndexExemplars(locale);
259 addLabels(exemplarChars);
263 * Add more index characters (aside from what are in the locale)
264 * @param additions additional characters to add to the index, such as A-Z.
265 * @return this, for chaining
267 * @provisional This API might change or be removed in a future release.
269 public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
270 initialLabels.addAll(additions);
276 * Add more index characters (aside from what are in the locale)
277 * @param additions additional characters to add to the index, such as those in Swedish.
278 * @return this, for chaining
280 * @provisional This API might change or be removed in a future release.
282 public AlphabeticIndex<V> addLabels(ULocale... additions) {
283 for (ULocale addition : additions) {
284 initialLabels.addAll(getIndexExemplars(addition));
291 * Add more index characters (aside from what are in the locale)
292 * @param additions additional characters to add to the index, such as those in Swedish.
293 * @return this, for chaining
295 * @provisional This API might change or be removed in a future release.
297 public AlphabeticIndex<V> addLabels(Locale... additions) {
298 for (Locale addition : additions) {
299 initialLabels.addAll(getIndexExemplars(ULocale.forLocale(addition)));
306 * Set the overflow label
307 * @param overflowLabel see class description
308 * @return this, for chaining
310 * @provisional This API might change or be removed in a future release.
312 public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
313 this.overflowLabel = overflowLabel;
318 * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
320 * @return underflow label
322 * @provisional This API might change or be removed in a future release.
324 public String getUnderflowLabel() {
325 return underflowLabel; // TODO get localized version
330 * Set the underflowLabel label
331 * @param underflowLabel see class description
332 * @return this, for chaining
334 * @provisional This API might change or be removed in a future release.
336 public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
337 this.underflowLabel = underflowLabel;
342 * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
344 * @return overflow label
346 * @provisional This API might change or be removed in a future release.
348 public String getOverflowLabel() {
349 return overflowLabel; // TODO get localized version
354 * Set the inflowLabel label
355 * @param inflowLabel see class description
356 * @return this, for chaining
358 * @provisional This API might change or be removed in a future release.
360 public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
361 this.inflowLabel = inflowLabel;
366 * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
367 * for Latin and Greek are used: X Y Z ... Α Β Γ.
369 * @return inflow label
371 * @provisional This API might change or be removed in a future release.
373 public String getInflowLabel() {
374 return inflowLabel; // TODO get localized version
379 * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
381 * @return maxLabelCount maximum number of labels.
383 * @provisional This API might change or be removed in a future release.
385 public int getMaxLabelCount() {
386 return maxLabelCount;
390 * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
393 * @return maxLabelCount label Set the maximum number of labels. Currently, if the number is exceeded, then every
394 * nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
397 * @provisional This API might change or be removed in a future release.
399 public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
400 this.maxLabelCount = maxLabelCount;
405 * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
406 * and sort differently, and that the overall list is small enough.
409 private ArrayList<String> initLabels() {
410 UnicodeSet exemplars = new UnicodeSet(initialLabels);
412 // First sort them, with an "best" ordering among items that are the same according
414 // Re the warning: the JDK inexplicably didn't make Collators be Comparator<String>!
415 @SuppressWarnings("unchecked")
416 Set<String> preferenceSorting = new TreeSet<String>(new MultiComparator<Object>(collatorPrimaryOnly, PREFERENCE_COMPARATOR));
417 exemplars.addAllTo(preferenceSorting);
419 TreeSet<String> indexCharacterSet = new TreeSet<String>(collatorPrimaryOnly);
421 // We nw make a sorted array of elements
422 // Some of the input may, however, be redundant.
423 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
424 // So we make a pass through, filtering out those cases.
426 for (String item : preferenceSorting) {
427 if (indexCharacterSet.contains(item)) {
428 for (String itemAlreadyIn : indexCharacterSet) {
429 if (collatorPrimaryOnly.compare(item, itemAlreadyIn) == 0) {
430 Set<String> targets = alreadyIn.get(itemAlreadyIn);
431 if (targets == null) {
432 alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
438 } else if (UTF16.countCodePoint(item) > 1 && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
439 noDistinctSorting.add(item);
440 } else if (!ALPHABETIC.containsSome(item)) {
441 notAlphabetic.add(item);
443 indexCharacterSet.add(item);
447 // if the result is still too large, cut down to maxCount elements, by removing every nth element
449 final int size = indexCharacterSet.size() - 1;
450 if (size > maxLabelCount) {
453 for (Iterator<String> it = indexCharacterSet.iterator(); it.hasNext();) {
456 final int bump = count * maxLabelCount / size;
465 return new ArrayList<String>(indexCharacterSet);
469 * This method is called to get the index exemplars. Normally these come from the locale directly,
470 * but if they aren't available, we have to synthesize them.
474 private UnicodeSet getIndexExemplars(ULocale locale) {
475 UnicodeSet exemplars;
477 exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
478 if (exemplars != null) {
480 final String language = locale.getLanguage();
481 if (language.equals("zh") || language.equals("ja") || language.equals("ko")) {
482 // find out which one we are using
483 TreeSet<String> probeSet = new TreeSet<String>(collatorOriginal);
485 // UnicodeSet tailored = collatorOriginal.getTailoredSet();
486 // tailored.addAllTo(probeSet);
487 // System.out.println(probeSet);
490 probeSet.addAll(PROBES);
491 String first = probeSet.iterator().next();
492 int location = PROBES.indexOf(first);
494 if (location == PINYIN_PROBE_INDEX) {
497 exemplars.clear().addAll(MATCHING[location]);
500 // LangType langType2 = LangType.fromLocale(locale);
501 // if (langType2 == LangType.TRADITIONAL) {
502 // Collator collator = Collator.getInstance(locale);
503 // if (collator.getTailoredSet().contains(probeCharInLongStroke)) {
504 // exemplars = HACK_LONG_TRAD_EXEMPLARS;
506 // exemplars = HACK_SHORT_TRAD_EXEMPLARS;
513 // Synthesize the index exemplars
515 exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
517 // get the exemplars, and handle special cases
519 exemplars = exemplars.cloneAsThawed();
520 // question: should we add auxiliary exemplars?
521 if (exemplars.containsSome(CORE_LATIN) || exemplars.size() == 0) {
522 exemplars.addAll(CORE_LATIN);
524 if (exemplars.containsSome(HANGUL)) {
525 // cut down to small list
526 exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
528 if (exemplars.containsSome(ETHIOPIC)) {
529 // cut down to small list
530 // make use of the fact that Ethiopic is allocated in 8's, where
531 // the base is 0 mod 8.
532 for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
533 if ((it.codepoint & 0x7) != 0) {
534 exemplars.remove(it.codepoint);
539 UnicodeSet uppercased = new UnicodeSet();
540 for (String item : exemplars) {
541 uppercased.add(UCharacter.toUpperCase(locale, item));
548 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
549 * <p>This is used to test whether contractions sort differently from their components.
551 private String separated(String item) {
552 StringBuilder result = new StringBuilder();
553 // add a CGJ except within surrogates
554 char last = item.charAt(0);
556 for (int i = 1; i < item.length(); ++i) {
557 char ch = item.charAt(i);
558 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
564 return result.toString();
570 * @return A collection listing the labels, after processing.
572 * @provisional This API might change or be removed in a future release.
574 public List<String> getBucketLabels() {
575 if (buckets == null) {
578 ArrayList<String> result = new ArrayList<String>();
579 for (Bucket<V> bucket : buckets) {
580 result.add(bucket.getLabel());
586 * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
587 * then stored. The next time it is accessed, the same instance is returned.
589 * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
590 * synchronizing.</i></b>
592 * @return a clone of the collator used internally
594 * @provisional This API might change or be removed in a future release.
596 public RuleBasedCollator getCollator() {
597 if (collatorExternal == null) {
599 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
600 } catch (Exception e) {
601 // should never happen
602 throw new IllegalStateException("Collator cannot be cloned", e);
605 return collatorExternal;
609 * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
610 * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
611 * the first added comes first.
614 * Name, such as a name
616 * Data, such as an address or link
617 * @return this, for chaining
619 * @provisional This API might change or be removed in a future release.
621 public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
622 // TODO instead of invalidating, just add to unprocessed list.
623 buckets = null; // invalidate old bucketlist
624 inputList.add(new Record<V>(name, data, inputList.size()));
629 * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
630 * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
631 * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
632 * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
633 * the index or collator.
635 * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
636 * those are changed, then the bucket number and sort key must be regenerated.
639 * Name, such as a name
640 * @return this, for chaining
642 * @provisional This API might change or be removed in a future release.
644 public int getBucketIndex(CharSequence name) {
645 if (buckets == null) {
648 // if (langType == LangType.SIMPLIFIED) {
649 // String hackPrefix = hackName(name, collatorPrimaryOnly);
650 // if (hackPrefix != null) {
651 // name = hackPrefix + name;
654 return rawGetBucketIndex(name);
657 private int rawGetBucketIndex(CharSequence name) {
658 // TODO use a binary search
660 Bucket<V> lastBucket = null;
661 Bucket<V> bucket = null;
662 for (Iterator<Bucket<V>> it = buckets.fullIterator(); it.hasNext();) {
664 if (bucket.lowerBoundary == null) { // last bucket
665 bucket = lastBucket; // back up the bucket
669 int bucketLower2name = collatorPrimaryOnly.compare(bucket.lowerBoundary, name);
670 if (bucketLower2name > 0) { // the first boundary is always "", and so -1 will never be returned
671 bucket = lastBucket; // back up the bucket
674 } else if (bucketLower2name == 0) {
680 // we will always have at least one bucket
681 // see if we need to remap
682 if (buckets.rebucket != null) {
683 Bucket<V> temp = buckets.rebucket.get(bucket);
688 for (Bucket<V> bucket2 : buckets) {
689 if (bucket2 == bucket) {
701 * @return this, for chaining
703 * @provisional This API might change or be removed in a future release.
705 public AlphabeticIndex<V> clearRecords() {
712 * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
714 * @return number of buckets
716 * @provisional This API might change or be removed in a future release.
718 public int getBucketCount() {
719 if (buckets == null) {
722 return buckets.bucketList.size();
726 * Return the number of records in the index: that is, the total number of distinct <name,data> pairs added with addRecord(...), over all the buckets.
728 * @return total number of records in buckets
730 * @provisional This API might change or be removed in a future release.
732 public int getRecordCount() {
733 return inputList.size();
737 * Return an iterator over the buckets.
739 * @return iterator over buckets.
741 * @provisional This API might change or be removed in a future release.
743 public Iterator<Bucket<V>> iterator() {
744 if (buckets == null) {
747 return buckets.iterator();
751 * Convenience routine to bucket a list of input strings according to the index.<br>
752 * Warning: if a UI suppresses buckets that are empty, this may result in the special buckets (underflow, overflow,
753 * inflow) being adjacent. In that case, the application may want to combine them.
756 * List of strings to be sorted and bucketed according to the labels.
758 * @provisional This API might change or be removed in a future release.
760 private void initBuckets() {
761 buckets = new BucketList();
763 // Make a collator for records. Do this so that the Records can be static classes, and not know about the collators.
764 // TODO make this a member of the class.
765 Comparator<Record<V>> fullComparator = new Comparator<Record<V>>() {
766 public int compare(Record<V> o1, Record<V> o2) {
767 int result = collatorOriginal.compare(o1.name, o2.name);
771 return o1.counter - o2.counter;
775 // // If we have Pinyin, then we have a special hack to bucket items with ASCII.
777 // Map<String,Bucket<V>> rebucketMap = new HashMap<String, Bucket<V>>();
778 // for (Record<V> name : inputList) {
779 // String key = hackName(name.name, collatorOriginal);
780 // if (key == null) continue;
781 // Bucket<V> bucket = rebucketMap.get(key);
782 // if (bucket == null) {
783 // int index = rawGetBucketIndex(key);
784 // bucket = buckets.bucketList.get(index);
786 // rebucketMap.put(key, bucket);
787 // name.rebucket = bucket;
791 // Set up a sorted list of the input
792 TreeSet<Record<V>> sortedInput = new TreeSet<Record<V>>(fullComparator);
793 sortedInput.addAll(inputList);
795 // Now, we traverse all of the input, which is now sorted.
796 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
797 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
798 // However, if the user adds item at a time and then gets the buckets, this isn't efficient, so
799 // we need to improve it for that case.
801 Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
802 Bucket<V> currentBucket = bucketIterator.next();
803 Bucket<V> nextBucket = bucketIterator.next();
804 String upperBoundary = nextBucket.lowerBoundary; // there is always at least one bucket, so this is safe
805 boolean atEnd = false;
806 for (Record<V> s : sortedInput) {
807 // // special hack for pinyin
808 // if (s.rebucket != null) {
809 // s.rebucket.records.add(s);
812 // if the current bucket isn't the right one, find the one that is
813 // We have a special flag for the last bucket so that we don't look any further
814 while (!atEnd && collatorPrimaryOnly.compare(s.name, upperBoundary) >= 0) {
815 currentBucket = nextBucket;
816 // now reset the boundary that we compare against
817 if (bucketIterator.hasNext()) {
818 nextBucket = bucketIterator.next();
819 upperBoundary = nextBucket.lowerBoundary;
820 if (upperBoundary == null) {
827 // now put the record into the bucket.
828 buckets.addTo(s, currentBucket);
833 * Get the Unicode character (or tailored string) that defines an overflow bucket; that is anything greater than or
834 * equal to that string should go in that bucket, instead of with the last character. Normally that is the first
835 * character of the script after lowerLimit. Thus in X Y Z ... <i>Devanagari-ka</i>, the overflow character for Z
836 * would be the <i>Greek-alpha</i>.
839 * The character below the overflow (or inflow) bucket
840 * @return string that defines top of the overflow buck for lowerLimit, or null if there is none
842 * @deprecated This API is ICU internal only.
844 public String getOverflowComparisonString(String lowerLimit) {
845 // TODO Use collator method instead of this hack
846 for (String s : HACK_FIRST_CHARS_IN_SCRIPTS) {
847 if (collatorPrimaryOnly.compare(s, lowerLimit) > 0) {
855 * Return a list of the first character in each script, in collation order. Only exposed for testing.
857 * @return list of first characters in each script
859 * @deprecated This API is ICU internal only.
861 public List<String> getFirstScriptCharacters() {
862 return HACK_FIRST_CHARS_IN_SCRIPTS;
866 * As the index is built, strings may be discarded from the exemplars. This contains some of the discards, and is
867 * intended for debugging.
870 * @deprecated This API is ICU internal only.
872 public Map<String, Set<String>> getAlreadyIn() {
877 * As the index is built, strings may be discarded from the exemplars. This contains some of the discards, and is
878 * intended for debugging.
881 * @deprecated This API is ICU internal only.
883 public List<String> getNoDistinctSorting() {
884 return noDistinctSorting;
888 * As the index is built, strings may be discarded from the exemplars. This contains some of the discards, and is
889 * intended for debugging.
892 * @deprecated This API is ICU internal only.
894 public List<String> getNotAlphabetic() {
895 return notAlphabetic;
898 private static UnicodeSet getScriptSet(String codePoint) {
899 if (codePoint.startsWith(BASE)) {
900 return new UnicodeSet(UNIHAN);
902 return new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, UScript.getScript(codePoint.codePointAt(0)));
905 private static final UnicodeSet IGNORE_SCRIPTS = new UnicodeSet(
906 "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]").freeze();
908 private static final PreferenceComparator PREFERENCE_COMPARATOR = new PreferenceComparator();
909 private int maxLabelCount = 99;
912 * Comparator that returns "better" strings first, where shorter NFKD is better, and otherwise NFKD binary order is
913 * better, and otherwise binary order is better.
915 private static class PreferenceComparator implements Comparator<Object> {
916 static final Comparator<String> binary = new UTF16.StringComparator(true, false, 0);
918 public int compare(Object o1, Object o2) {
919 return compare((String) o1, (String) o2);
922 public int compare(String s1, String s2) {
926 String n1 = Normalizer.decompose(s1, true);
927 String n2 = Normalizer.decompose(s2, true);
928 int result = n1.length() - n2.length();
932 result = binary.compare(n1, n2);
936 return binary.compare(s1, s2);
941 * A record to be sorted into buckets with getIndexBucketCharacters.
944 * @provisional This API might change or be removed in a future release.
946 public static class Record<V> {
947 //private Bucket<V> rebucket = null; // special hack for Pinyin
948 private CharSequence name;
952 private Record(CharSequence name, V data, int counter) {
955 this.counter = counter;
963 * @provisional This API might change or be removed in a future release.
965 public CharSequence getName() {
974 * @provisional This API might change or be removed in a future release.
981 * Standard toString()
983 * @provisional This API might change or be removed in a future release.
985 public String toString() {
986 return name + "=" + data
987 //+ (rebucket == null ? "" : "{" + rebucket.label + "}")
993 * A "bucket", containing records sorted under an index string by getIndexBucketCharacters. Is created by the
994 * addBucket method in BucketList. A typical implementation will provide methods getLabel(), getSpecial(), and
996 * See com.ibm.icu.dev.test.collator.IndexCharactersTest for an example.
1001 * @provisional This API might change or be removed in a future release.
1003 public static class Bucket<V> implements Iterable<Record<V>> {
1004 private final String label;
1005 private final String lowerBoundary;
1006 private final LabelType labelType;
1007 private final List<Record<V>> records = new ArrayList<Record<V>>();
1013 * @provisional This API might change or be removed in a future release.
1015 public enum LabelType {
1016 NORMAL, UNDERFLOW, INFLOW, OVERFLOW
1020 * Set up the bucket.
1023 * label for the bucket
1025 * is an underflow, overflow, or inflow bucket
1027 * @provisional This API might change or be removed in a future release.
1029 private Bucket(String label, String lowerBoundary, LabelType labelType) {
1030 // String hackLabel = HACK_TRADITIONAL.get(label);
1031 // if (hackLabel != null) {
1032 // label = hackLabel;
1035 this.lowerBoundary = lowerBoundary;
1036 this.labelType = labelType;
1039 String getLowerBoundary() {
1040 return lowerBoundary;
1046 * @return label for the bucket
1048 * @provisional This API might change or be removed in a future release.
1050 public String getLabel() {
1055 * Is a normal, underflow, overflow, or inflow bucket
1057 * @return is an underflow, overflow, or inflow bucket
1059 * @provisional This API might change or be removed in a future release.
1061 public LabelType getLabelType() {
1066 * Get the number of records in the bucket.
1068 * @return number of records in bucket
1070 * @provisional This API might change or be removed in a future release.
1073 return records.size();
1077 * Iterator over the records in the bucket
1079 * @provisional This API might change or be removed in a future release.
1081 public Iterator<Record<V>> iterator() {
1082 return records.iterator();
1086 * Standard toString()
1088 * @provisional This API might change or be removed in a future release.
1091 public String toString() {
1093 "labelType=" + labelType
1095 "lowerBoundary=" + lowerBoundary
1103 private class BucketList implements Iterable<Bucket<V>> {
1104 private final ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
1105 private final HashMap<Bucket<V>,Bucket<V>> rebucket;
1106 private final List<Bucket<V>> immutableVisibleList;
1108 private BucketList() {
1109 // initialize indexCharacters;
1110 List<String> indexCharacters = initLabels();
1113 bucketList.add(new Bucket<V>(getUnderflowLabel(), "", Bucket.LabelType.UNDERFLOW));
1115 // fix up the list, adding underflow, additions, overflow
1116 // insert infix labels as needed, using \uFFFF.
1117 String last = indexCharacters.get(0);
1118 bucketList.add(new Bucket<V>(fixLabel(last), last, Bucket.LabelType.NORMAL));
1119 UnicodeSet lastSet = getScriptSet(last).removeAll(IGNORE_SCRIPTS);
1121 for (int i = 1; i < indexCharacters.size(); ++i) {
1122 String current = indexCharacters.get(i);
1123 UnicodeSet set = getScriptSet(current).removeAll(IGNORE_SCRIPTS);
1124 if (lastSet.containsNone(set)) {
1125 // check for adjacent
1126 String overflowComparisonString = getOverflowComparisonString(last);
1127 if (collatorPrimaryOnly.compare(overflowComparisonString, current) < 0) {
1128 bucketList.add(new Bucket<V>(getInflowLabel(), overflowComparisonString,
1129 Bucket.LabelType.INFLOW));
1134 bucketList.add(new Bucket<V>(fixLabel(current), current, Bucket.LabelType.NORMAL));
1139 String limitString = getOverflowComparisonString(last);
1140 bucketList.add(new Bucket<V>(getOverflowLabel(), limitString, Bucket.LabelType.OVERFLOW)); // final
1142 // add some redirects for Pinyin
1144 ArrayList<Bucket<V>> publicBucketList;
1146 rebucket = new HashMap<Bucket<V>,Bucket<V>>();
1147 publicBucketList = new ArrayList<Bucket<V>>();
1148 HashMap<String,Bucket<V>> rebucketLabel = new HashMap<String,Bucket<V>>();
1149 Bucket<V> flowBefore = null; // special handling for flow bucket before pinyin
1150 boolean flowRedirect = false;
1151 boolean havePinyin = false;
1153 for (Bucket<V> bucket : bucketList) {
1154 String label = bucket.getLabel();
1155 String lowerBound = bucket.getLowerBoundary();
1156 if (lowerBound != null && lowerBound.startsWith(BASE)) { // pinyin
1157 rebucket.put(bucket, rebucketLabel.get(label));
1159 } else { // not pinyin
1160 if (bucket.labelType != LabelType.NORMAL) { // special handling for flows
1161 if (flowRedirect == false) {
1163 // do a redirect from the last before pinyin to the first before;
1164 // we do it this way so that the buckets are joined, and any between stuff goes to the end
1165 // eg a b c alpha chinese gorp
1166 // we want to show as ... a b c ... with the alpha and gorp both in the final bucket.
1167 rebucket.put(flowBefore, bucket);
1168 publicBucketList.remove(flowBefore);
1169 flowRedirect = true;
1171 flowBefore = bucket;
1174 } else { // is NORMAL
1175 rebucketLabel.put(label, bucket);
1177 publicBucketList.add(bucket);
1182 publicBucketList = bucketList;
1184 immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1189 * @param currentBucket
1191 private void addTo(Record<V> s, Bucket<V> currentBucket) {
1192 if (rebucket != null) {
1193 Bucket<V> newBucket = rebucket.get(currentBucket);
1194 if (newBucket != null) {
1195 currentBucket = newBucket;
1198 currentBucket.records.add(s);
1205 private String fixLabel(String current) {
1206 if (!current.startsWith(BASE)) {
1209 int rest = current.charAt(1);
1210 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
1211 return (rest-0x2800) + "\u5283"; // HACK
1213 return current.substring(1);
1217 * Private iterator over all the buckets, visible and invisible
1219 private Iterator<Bucket<V>> fullIterator() {
1220 return bucketList.iterator();
1224 * Iterator over just the visible buckets.
1226 public Iterator<Bucket<V>> iterator() {
1227 return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1236 // * Only gets called for simplified Chinese. Uses further hack to distinguish long from short pinyin table.
1238 // private String hackName(CharSequence name, RuleBasedCollator comparator) {
1239 // if (!UNIHAN.contains(Character.codePointAt(name, 0))) {
1242 // synchronized (PINYIN_LOWER_BOUNDS_LONG) {
1243 // if (PINYIN_LOWER_BOUNDS == null) {
1244 // if (comparator.getTailoredSet().contains(probeCharInLong)) {
1245 // PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
1246 // HACK_PINYIN_LOOKUP = HACK_PINYIN_LOOKUP_LONG;
1248 // PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
1249 // HACK_PINYIN_LOOKUP = HACK_PINYIN_LOOKUP_SHORT;
1253 // int index = Arrays.binarySearch(HACK_PINYIN_LOOKUP, name, comparator);
1255 // index = -index - 2;
1257 // return PINYIN_LOWER_BOUNDS.substring(index, index + 1);
1260 // private static String PINYIN_LOWER_BOUNDS;
1262 // private static String[] HACK_PINYIN_LOOKUP;
1267 // * Generated with org.unicode.draft.GenerateUnihanCollator.
1270 // private static final int probeCharInLong = 0x28EAD;
1271 // private static final int probeCharInLongStroke = 0x2A6A5;
1273 // private static final String PINYIN_LOWER_BOUNDS_LONG = "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
1275 // private static final String[] HACK_PINYIN_LOOKUP_LONG = {
1277 // "\u516B", // b : \u516B [b\u0101]
1278 // "\uD863\uDEAD", // c : \U00028EAD [c\u0101]
1279 // "\uD844\uDE51", // d : \U00021251 [d\u0101]
1280 // "\u59B8", // e : \u59B8 [\u0113]
1281 // "\u53D1", // f : \u53D1 [f\u0101]
1282 // "\uD844\uDE45", // g : \U00021245 [g\u0101]
1283 // "\u54C8", // h : \u54C8 [h\u0101]
1284 // "\u4E0C", // j : \u4E0C [j\u012B]
1285 // "\u5494", // k : \u5494 [k\u0101]
1286 // "\u3547", // l : \u3547 [l\u0101]
1287 // "\u5452", // m : \u5452 [\u1E3F]
1288 // "\u5514", // n : \u5514 [\u0144]
1289 // "\u5594", // o : \u5594 [\u014D]
1290 // "\uD84F\uDC7A", // p : \U00023C7A [p\u0101]
1291 // "\u4E03", // q : \u4E03 [q\u012B]
1292 // "\u513F", // r : \u513F [r]
1293 // "\u4EE8", // s : \u4EE8 [s\u0101]
1294 // "\u4ED6", // t : \u4ED6 [t\u0101]
1295 // "\u7A75", // w : \u7A75 [w\u0101]
1296 // "\u5915", // x : \u5915 [x\u012B]
1297 // "\u4E2B", // y : \u4E2B [y\u0101]
1298 // "\u5E00", // z : \u5E00 [z\u0101]
1301 // private static String PINYIN_LOWER_BOUNDS_SHORT = "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
1303 // private static String[] HACK_PINYIN_LOOKUP_SHORT = {
1305 // "\u516B", // b : \u516B [b\u0101]
1306 // "\u5693", // c : \u5693 [c\u0101]
1307 // "\u5491", // d : \u5491 [d\u0101]
1308 // "\u59B8", // e : \u59B8 [\u0113]
1309 // "\u53D1", // f : \u53D1 [f\u0101]
1310 // "\u65EE", // g : \u65EE [g\u0101]
1311 // "\u54C8", // h : \u54C8 [h\u0101]
1312 // "\u4E0C", // j : \u4E0C [j\u012B]
1313 // "\u5494", // k : \u5494 [k\u0101]
1314 // "\u3547", // l : \u3547 [l\u0101]
1315 // "\u5452", // m : \u5452 [\u1E3F]
1316 // "\u5514", // n : \u5514 [\u0144]
1317 // "\u5594", // o : \u5594 [\u014D]
1318 // "\u5991", // p : \u5991 [p\u0101]
1319 // "\u4E03", // q : \u4E03 [q\u012B]
1320 // "\u513F", // r : \u513F [r]
1321 // "\u4EE8", // s : \u4EE8 [s\u0101]
1322 // "\u4ED6", // t : \u4ED6 [t\u0101]
1323 // "\u7A75", // w : \u7A75 [w\u0101]
1324 // "\u5915", // x : \u5915 [x\u012B]
1325 // "\u4E2B", // y : \u4E2B [y\u0101]
1326 // "\u5E00", // z : \u5E00 [z\u0101]
1329 // private static final Map<String,String> HACK_TRADITIONAL;
1331 // Map<String,String> temp = new HashMap<String,String>();
1332 // temp.put("\u4E00", "1\u5283");
1333 // temp.put("\u4E01", "2\u5283");
1334 // temp.put("\u4E07", "3\u5283");
1335 // temp.put("\u4E0D", "4\u5283");
1336 // temp.put("\u4E17", "5\u5283");
1337 // temp.put("\u3401", "6\u5283");
1338 // temp.put("\u4E23", "7\u5283");
1339 // temp.put("\u4E26", "8\u5283");
1340 // temp.put("\u4E34", "9\u5283");
1341 // temp.put("\uD840\uDC35", "9\u5283");
1342 // temp.put("\uD840\uDC3E", "10\u5283");
1343 // temp.put("\uD840\uDC3D", "10\u5283");
1344 // temp.put("\u3422", "11\u5283");
1345 // temp.put("\uD840\uDC41", "11\u5283");
1346 // temp.put("\uD840\uDC46", "12\u5283");
1347 // temp.put("\u4E82", "13\u5283");
1348 // temp.put("\uD840\uDC4C", "13\u5283");
1349 // temp.put("\uD840\uDC4E", "14\u5283");
1350 // temp.put("\u3493", "15\u5283");
1351 // temp.put("\uD840\uDC53", "15\u5283");
1352 // temp.put("\u4EB8", "16\u5283");
1353 // temp.put("\uD840\uDC55", "16\u5283");
1354 // temp.put("\u511F", "17\u5283");
1355 // temp.put("\uD840\uDC56", "17\u5283");
1356 // temp.put("\u512D", "18\u5283");
1357 // temp.put("\uD840\uDC5F", "18\u5283");
1358 // temp.put("\u3426", "19\u5283");
1359 // temp.put("\uD840\uDC7A", "19\u5283");
1360 // temp.put("\u34A5", "20\u5283");
1361 // temp.put("\uD840\uDC60", "20\u5283");
1362 // temp.put("\u34A7", "21\u5283");
1363 // temp.put("\uD840\uDD9E", "21\u5283");
1364 // temp.put("\u4EB9", "22\u5283");
1365 // temp.put("\uD840\uDC7B", "22\u5283");
1366 // temp.put("\u513D", "23\u5283");
1367 // temp.put("\uD840\uDCC8", "23\u5283");
1368 // temp.put("\u513E", "24\u5283");
1369 // temp.put("\uD840\uDD9F", "24\u5283");
1370 // temp.put("\u56D4", "25\u5283");
1371 // temp.put("\uD842\uDCCA", "25\u5283");
1372 // temp.put("\u3536", "26\u5283");
1373 // temp.put("\u34AA", "26\u5283");
1374 // temp.put("\u7065", "27\u5283");
1375 // temp.put("\uD842\uDE0B", "27\u5283");
1376 // temp.put("\u56D6", "28\u5283");
1377 // temp.put("\uD840\uDDA0", "28\u5283");
1378 // temp.put("\u7E9E", "29\u5283");
1379 // temp.put("\uD840\uDDA1", "29\u5283");
1380 // temp.put("\u53B5", "30\u5283");
1381 // temp.put("\uD842\uDD6C", "30\u5283");
1382 // temp.put("\u7069", "31\u5283");
1383 // temp.put("\uD844\uDD9F", "31\u5283");
1384 // temp.put("\u706A", "32\u5283");
1385 // temp.put("\uD842\uDED1", "32\u5283");
1386 // temp.put("\uD846\uDD3B", "33\u5283");
1387 // temp.put("\uD842\uDE0C", "33\u5283");
1388 // temp.put("\uD842\uDCCB", "34\u5283");
1389 // temp.put("\u9F7E", "35\u5283");
1390 // temp.put("\uD84C\uDF5C", "35\u5283");
1391 // temp.put("\u9F49", "36\u5283");
1392 // temp.put("\uD845\uDD19", "36\u5283");
1393 // temp.put("\uD86B\uDE9A", "37\u5283");
1394 // temp.put("\uD861\uDC04", "38\u5283");
1395 // temp.put("\u9750", "39\u5283");
1396 // temp.put("\uD845\uDD1A", "39\u5283");
1397 // temp.put("\uD864\uDDD3", "40\u5283");
1398 // temp.put("\uD869\uDCCA", "41\u5283");
1399 // temp.put("\uD85A\uDDC4", "42\u5283");
1400 // temp.put("\uD85C\uDD98", "43\u5283");
1401 // temp.put("\uD85E\uDCB1", "44\u5283");
1402 // temp.put("\uD865\uDE63", "46\u5283");
1403 // temp.put("\u9F98", "48\u5283");
1404 // temp.put("\uD85A\uDDC5", "48\u5283");
1405 // temp.put("\u4A3B", "52\u5283");
1406 // temp.put("\uD841\uDD3B", "64\u5283");
1407 // HACK_TRADITIONAL = Collections.unmodifiableMap(temp);
1413 private static final List<String> HACK_FIRST_CHARS_IN_SCRIPTS =
1414 Arrays.asList(new String[] {
1415 "a", "\u03B1", "\u2C81", "\u0430", "\u2C30", "\u10D0", "\u0561", "\u05D0", "\uD802\uDD00", "\u0800", "\u0621",
1417 "\u0840", // Mandaic
1418 "\u0780", "\u07CA", "\u2D30", "\u1200", "\u0950", "\u0985", "\u0A74", "\u0AD0", "\u0B05", "\u0BD0",
1419 "\u0C05", "\u0C85", "\u0D05", "\u0D85", "\uABC0", "\uA800", "\uA882", "\uD804\uDC83",
1420 "\u1B83", // Sundanese
1421 "\uD804\uDC05", // Brahmi (U+11005)
1422 "\uD802\uDE00", "\u0E01", "\u0E81", "\uAA80", "\u0F40", "\u1C00", "\uA840", "\u1900", "\u1700", "\u1720", "\u1740", "\u1760",
1423 "\u1A00", // Buginese
1425 "\uA930", "\uA90A", "\u1000", "\u1780", "\u1950", "\u1980", "\u1A20", "\uAA00", "\u1B05", "\uA984", "\u1880", "\u1C5A", "\u13A0", "\u1401", "\u1681", "\u16A0", "\uD803\uDC00", "\uA500", "\uA6A0", "\u1100",
1426 "\u3041", "\u30A1", "\u3105", "\uA000", "\uA4F8", "\uD800\uDE80", "\uD800\uDEA0", "\uD802\uDD20", "\uD800\uDF00", "\uD800\uDF30", "\uD801\uDC28", "\uD801\uDC50", "\uD801\uDC80", "\uD800\uDC00", "\uD802\uDC00", "\uD802\uDE60", "\uD802\uDF00", "\uD802\uDC40",
1427 "\uD802\uDF40", "\uD802\uDF60", "\uD800\uDF80", "\uD800\uDFA0", "\uD808\uDC00", "\uD80C\uDC00", "\u4E00"
1430 // private static final UnicodeSet HACK_SHORT_TRAD_EXEMPLARS = new UnicodeSet(
1431 // "[\u3401 \u3422 \u3426 \u3493 \u34A5 \u34A7 \u3536 \u4E00 \u4E01 \u4E07 \u4E0D \u4E17 \u4E23 \u4E26 \u4E34 \u4E82 \u4EB8 \u4EB9 \u511F \u512D \u513D" +
1432 // " \u513E \u53B5 \u56D4 \u56D6 \u7065 \u7069 \u706A \u7E9E \u9750 \u9F49 \u9F7E \u9F98 \\U0002003E \\U00020046 \\U0002004E \\U0002193B]").freeze();
1433 // private static final UnicodeSet HACK_LONG_TRAD_EXEMPLARS = new UnicodeSet(
1434 // "[\u3401\u34AA\u4A3B\u4E00\u4E01\u4E07\u4E0D\u4E17\u4E23\u4E26" +
1435 // "\\U00020035\\U0002003D\\U00020041\\U00020046\\U0002004C\\U0002004E\\U00020053\\U00020055\\U00020056\\U0002005F\\U00020060\\U0002007A\\U0002007B\\U000200C8" +
1436 // "\\U0002019E-\\U000201A1\\U0002053B\\U000208CA\\U000208CB\\U0002096C\\U00020A0B\\U00020A0C\\U00020AD1\\U0002119F\\U00021519\\U0002151A\\U0002335C\\U000269C4" +
1437 // "\\U000269C5\\U00027198\\U000278B1\\U00028404\\U000291D3\\U00029663\\U0002A4CA\\U0002AE9A]").freeze();
1439 * Only for testing...
1441 * @deprecated only for internal testing
1443 public static List<String> getFirstCharactersInScripts() {
1444 return HACK_FIRST_CHARS_IN_SCRIPTS;