2 *******************************************************************************
3 * Copyright (C) 2008-2013, Google Inc, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.Collection;
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Locale;
18 import com.ibm.icu.lang.UCharacter;
19 import com.ibm.icu.text.AlphabeticIndex.Bucket;
20 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
21 import com.ibm.icu.util.LocaleData;
22 import com.ibm.icu.util.ULocale;
25 * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
26 * It can support either direct use, or use with a client that doesn't support localized collation.
27 * The following is an example of what an index might look like in a UI:
30 * <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ...</b>
41 * The class can generate a list of labels for use as a UI "index", that is, a list of
42 * clickable characters (or character sequences) that allow the user to see a segment
43 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
44 * the target list, where everything in the bucket is greater than or equal to the character
45 * (according to the locale's collation). Strings can be added to the index;
46 * they will be in sorted order in the right bucket.</p>
48 * The class also supports having buckets for strings before the first (underflow),
49 * after the last (overflow), and between scripts (inflow). For example, if the index
50 * is constructed with labels for Russian and English, Greek characters would fall
51 * into an inflow bucket between the other two scripts.</p>
53 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
54 * as well as characters from the user's language,
55 * then it is a good idea to call addLabels(ULocale.English).</p>
58 * <p>The following shows an example of building an index directly.
59 * The "show..." methods below are just to illustrate usage.
62 * // Create a simple index where the values for the strings are Integers, and add the strings
64 * AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale);
66 * for (String item : test) {
67 * index.addRecord(item, counter++);
70 * // Show index at top. We could skip or gray out empty buckets
72 * for (AlphabeticIndex.Bucket<Integer> bucket : index) {
73 * if (showAll || bucket.size() != 0) {
74 * showLabelAtTop(UI, bucket.getLabel());
78 * // Show the buckets with their contents, skipping empty buckets
80 * for (AlphabeticIndex.Bucket<Integer> bucket : index) {
81 * if (bucket.size() != 0) {
82 * showLabelInList(UI, bucket.getLabel());
83 * for (AlphabeticIndex.Record<Integer> item : bucket) {
84 * showIndexedItem(UI, item.getName(), item.getData());
88 * The caller can build different UIs using this class.
89 * For example, an index character could be omitted or grayed-out
90 * if its bucket is empty. Small buckets could also be combined based on size, such as:
93 * <b>... A-F G-N O-Z ...</b>
96 * <h2>Client Support</h2>
97 * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
98 * to support sorting on a client that doesn't support AlphabeticIndex functionality.
100 * <p>The ImmutableIndex is both immutable and thread-safe.
101 * The corresponding AlphabeticIndex methods are not thread-safe because
102 * they "lazily" build the index buckets.
104 * <li>ImmutableIndex.getBucket(index) provides random access to all
105 * buckets and their labels and label types.
106 * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
107 * can be used to get a list of the labels,
108 * such as "...", "A", "B",..., and send that list to the client.
109 * <li>When the client has a new name, it sends that name to the server.
110 * The server needs to call the following methods,
111 * and communicate the bucketIndex and collationKey back to the client.
114 * int bucketIndex = index.getBucketIndex(name);
115 * String label = immutableIndex.getBucket(bucketIndex).getLabel(); // optional
116 * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
119 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
120 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
126 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
128 * Prefix string for Chinese index buckets.
129 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
131 private static final String BASE = "\uFDD0";
133 private static final char CGJ = '\u034F';
135 private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
137 private final RuleBasedCollator collatorOriginal;
138 private final RuleBasedCollator collatorPrimaryOnly;
139 private RuleBasedCollator collatorExternal;
141 // Comparator for records, so that the Record class can be static.
142 private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
143 public int compare(Record<V> o1, Record<V> o2) {
144 return collatorOriginal.compare(o1.name, o2.name);
148 private final List<String> firstCharsInScripts;
150 // We accumulate these as we build up the input parameters
151 private final UnicodeSet initialLabels = new UnicodeSet();
152 private List<Record<V>> inputList;
154 // Lazy evaluated: null means that we have not built yet.
155 private BucketList<V> buckets;
157 private String overflowLabel = "\u2026";
158 private String underflowLabel = "\u2026";
159 private String inflowLabel = "\u2026";
162 * Immutable, thread-safe version of {@link AlphabeticIndex}.
163 * This class provides thread-safe methods for bucketing,
164 * and random access to buckets and their properties,
165 * but does not offer adding records to the index.
167 * @param <V> The Record value type is unused. It can be omitted for this class
168 * if it was omitted for the AlphabeticIndex that built it.
170 * @provisional This API might change or be removed in a future release.
172 public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
173 private final BucketList<V> buckets;
174 private final Collator collatorPrimaryOnly;
176 private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
177 this.buckets = bucketList;
178 this.collatorPrimaryOnly = collatorPrimaryOnly;
182 * Returns the number of index buckets and labels, including underflow/inflow/overflow.
184 * @return the number of index buckets
186 * @provisional This API might change or be removed in a future release.
188 public int getBucketCount() {
189 return buckets.getBucketCount();
193 * Finds the index bucket for the given name and returns the number of that bucket.
194 * Use {@link #getBucket(int)} to get the bucket's properties.
196 * @param name the string to be sorted into an index bucket
197 * @return the bucket number for the name
199 * @provisional This API might change or be removed in a future release.
201 public int getBucketIndex(CharSequence name) {
202 return buckets.getBucketIndex(name, collatorPrimaryOnly);
206 * Returns the index-th bucket. Returns null if the index is out of range.
208 * @param index bucket number
209 * @return the index-th bucket
211 * @provisional This API might change or be removed in a future release.
213 public Bucket<V> getBucket(int index) {
214 if (0 <= index && index < buckets.getBucketCount()) {
215 return buckets.immutableVisibleList.get(index);
224 * @provisional This API might change or be removed in a future release.
226 public Iterator<Bucket<V>> iterator() {
227 return buckets.iterator();
232 * Create the index object.
235 * The locale for the index.
238 public AlphabeticIndex(ULocale locale) {
243 * Create the index object.
246 * The locale for the index.
249 public AlphabeticIndex(Locale locale) {
250 this(ULocale.forLocale(locale), null);
254 * Create an AlphabeticIndex that uses a specific collator.
256 * <p>The index will be created with no labels; the addLabels() function must be called
257 * after creation to add the desired labels to the index.
259 * <p>The index will work directly with the supplied collator. If the caller will need to
260 * continue working with the collator it should be cloned first, so that the
261 * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
263 * @param collator The collator to use to order the contents of this index.
265 * @provisional This API might change or be removed in a future release.
267 public AlphabeticIndex(RuleBasedCollator collator) {
268 this(null, collator);
272 * Internal constructor containing implementation used by public constructors.
274 private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
275 collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
277 collatorPrimaryOnly = (RuleBasedCollator) (collatorOriginal.clone());
278 } catch (Exception e) {
279 // should never happen
280 throw new IllegalStateException("Collator cannot be cloned", e);
282 collatorPrimaryOnly.setStrength(Collator.PRIMARY);
283 collatorPrimaryOnly.freeze();
285 firstCharsInScripts = new ArrayList<String>(HACK_FIRST_CHARS_IN_SCRIPTS);
286 Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
287 if (collatorPrimaryOnly.compare("\u4E00", "\u1112") <= 0 &&
288 collatorPrimaryOnly.compare("\u1100", "\u4E00") <= 0) {
289 // The standard Korean tailoring sorts Hanja (Han characters)
290 // as secondary differences from Hangul syllables.
291 // This makes U+4E00 not useful as a Han-script boundary.
292 // TODO: This becomes obsolete when the root collator gets
293 // reliable script-first-primary mappings.
294 int hanIndex = Collections.binarySearch(
295 firstCharsInScripts, "\u4E00", collatorPrimaryOnly);
297 firstCharsInScripts.remove(hanIndex);
300 // Guard against a degenerate collator where
301 // some script boundary strings are primary ignorable.
303 if (firstCharsInScripts.isEmpty()) {
304 throw new IllegalArgumentException(
305 "AlphabeticIndex requires some non-ignorable script boundary strings");
307 if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
308 firstCharsInScripts.remove(0);
314 if (locale != null) {
315 addIndexExemplars(locale);
320 * Add more index characters (aside from what are in the locale)
321 * @param additions additional characters to add to the index, such as A-Z.
322 * @return this, for chaining
325 public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
326 initialLabels.addAll(additions);
332 * Add more index characters (aside from what are in the locale)
333 * @param additions additional characters to add to the index, such as those in Swedish.
334 * @return this, for chaining
337 public AlphabeticIndex<V> addLabels(ULocale... additions) {
338 for (ULocale addition : additions) {
339 addIndexExemplars(addition);
346 * Add more index characters (aside from what are in the locale)
347 * @param additions additional characters to add to the index, such as those in Swedish.
348 * @return this, for chaining
351 public AlphabeticIndex<V> addLabels(Locale... additions) {
352 for (Locale addition : additions) {
353 addIndexExemplars(ULocale.forLocale(addition));
360 * Set the overflow label
361 * @param overflowLabel see class description
362 * @return this, for chaining
365 public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
366 this.overflowLabel = overflowLabel;
372 * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
374 * @return underflow label
377 public String getUnderflowLabel() {
378 return underflowLabel; // TODO get localized version
383 * Set the underflowLabel label
384 * @param underflowLabel see class description
385 * @return this, for chaining
388 public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
389 this.underflowLabel = underflowLabel;
395 * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
397 * @return overflow label
400 public String getOverflowLabel() {
401 return overflowLabel; // TODO get localized version
406 * Set the inflowLabel label
407 * @param inflowLabel see class description
408 * @return this, for chaining
411 public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
412 this.inflowLabel = inflowLabel;
418 * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
419 * for Latin and Greek are used: X Y Z ... Α Β Γ.
421 * @return inflow label
424 public String getInflowLabel() {
425 return inflowLabel; // TODO get localized version
430 * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
432 * @return maxLabelCount maximum number of labels.
435 public int getMaxLabelCount() {
436 return maxLabelCount;
440 * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
443 * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
444 * nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
446 * @return this, for chaining
449 public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
450 this.maxLabelCount = maxLabelCount;
456 * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
457 * and sort differently, and that the overall list is small enough.
459 private List<String> initLabels() {
460 Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
461 List<String> indexCharacters = new ArrayList<String>();
463 String firstScriptBoundary = firstCharsInScripts.get(0);
464 String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
466 // We make a sorted array of elements.
467 // Some of the input may be redundant.
468 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
469 // We filter out those cases.
470 for (String item : initialLabels) {
471 boolean checkDistinct;
472 if (!UTF16.hasMoreCodePointsThan(item, 1)) {
473 checkDistinct = false;
474 } else if(item.charAt(item.length() - 1) == '*' &&
475 item.charAt(item.length() - 2) != '*') {
476 // Use a label if it is marked with one trailing star,
477 // even if the label string sorts the same when all contractions are suppressed.
478 item = item.substring(0, item.length() - 1);
479 checkDistinct = false;
481 checkDistinct = true;
483 if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
484 // Ignore a primary-ignorable or non-alphabetic index character.
485 } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
486 // Ignore an index characters that will land in the overflow bucket.
487 } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
488 // Ignore a multi-code point index character that does not sort distinctly
489 // from the sequence of its separate characters.
491 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
492 if (insertionPoint < 0) {
493 indexCharacters.add(~insertionPoint, item);
495 String itemAlreadyIn = indexCharacters.get(insertionPoint);
496 if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
497 indexCharacters.set(insertionPoint, item);
503 // if the result is still too large, cut down to maxCount elements, by removing every nth element
505 final int size = indexCharacters.size() - 1;
506 if (size > maxLabelCount) {
509 for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
512 final int bump = count * maxLabelCount / size;
521 return indexCharacters;
524 private static String fixLabel(String current) {
525 if (!current.startsWith(BASE)) {
528 int rest = current.charAt(BASE.length());
529 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
530 return (rest-0x2800) + "\u5283";
532 return current.substring(BASE.length());
536 * This method is called to get the index exemplars. Normally these come from the locale directly,
537 * but if they aren't available, we have to synthesize them.
539 private void addIndexExemplars(ULocale locale) {
540 // Chinese index characters, which are specific to each of the several Chinese tailorings,
541 // take precedence over the single locale data exemplar set per language.
542 final String language = locale.getLanguage();
543 if (language.equals("zh") || language.equals("ja") || language.equals("ko")) {
544 // TODO: This should be done regardless of the language, but it's expensive.
545 // We should add a Collator function (can be @internal)
546 // to enumerate just the contractions that start with a given code point or string.
547 if (addChineseIndexCharacters()) {
552 UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
553 if (exemplars != null) {
554 initialLabels.addAll(exemplars);
558 // The locale data did not include explicit Index characters.
559 // Synthesize a set of them from the locale's standard exemplar characters.
560 exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
562 exemplars = exemplars.cloneAsThawed();
563 // question: should we add auxiliary exemplars?
564 if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
565 exemplars.addAll('a', 'z');
567 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
568 // cut down to small list
569 exemplars.remove(0xAC00, 0xD7A3).
570 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
571 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
572 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
573 add(0xD30C).add(0xD558);
575 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
576 // cut down to small list
577 // make use of the fact that Ethiopic is allocated in 8's, where
578 // the base is 0 mod 8.
579 UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
580 UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
581 while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
582 if ((it.codepoint & 0x7) != 0) {
583 exemplars.remove(it.codepoint);
588 // Upper-case any that aren't already so.
589 // (We only do this for synthesized index characters.)
590 for (String item : exemplars) {
591 initialLabels.add(UCharacter.toUpperCase(locale, item));
596 * Add Chinese index characters from the tailoring.
598 private boolean addChineseIndexCharacters() {
599 UnicodeSet contractions = new UnicodeSet();
601 collatorPrimaryOnly.getContractionsAndExpansions(contractions, null, false);
602 } catch (Exception e) {
605 String firstHanBoundary = null;
606 boolean hasPinyin = false;
607 for (String s : contractions) {
608 if (s.startsWith(BASE)) {
609 initialLabels.add(s);
610 if (firstHanBoundary == null ||
611 collatorPrimaryOnly.compare(s, firstHanBoundary) < 0) {
612 firstHanBoundary = s;
614 char c = s.charAt(s.length() - 1);
615 if ('A' <= c && c <= 'Z') {
621 initialLabels.add('A', 'Z');
623 if (firstHanBoundary != null) {
624 // The hardcoded list of script boundaries includes U+4E00
625 // which is tailored to not be the first primary
626 // in all Chinese tailorings except "unihan".
627 // Replace U+4E00 with the first boundary string from the tailoring.
628 // TODO: This becomes obsolete when the root collator gets
629 // reliable script-first-primary mappings.
630 int hanIndex = Collections.binarySearch(
631 firstCharsInScripts, "\u4E00", collatorPrimaryOnly);
633 firstCharsInScripts.set(hanIndex, firstHanBoundary);
642 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
643 * <p>This is used to test whether contractions sort differently from their components.
645 private String separated(String item) {
646 StringBuilder result = new StringBuilder();
647 // add a CGJ except within surrogates
648 char last = item.charAt(0);
650 for (int i = 1; i < item.length(); ++i) {
651 char ch = item.charAt(i);
652 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
658 return result.toString();
662 * Builds an immutable, thread-safe version of this instance, without data records.
664 * @return an immutable index instance
666 * @provisional This API might change or be removed in a future release.
668 public ImmutableIndex<V> buildImmutableIndex() {
669 // The current AlphabeticIndex Java code never modifies the bucket list once built.
670 // If it contains no records, we can use it.
671 // addRecord() sets buckets=null rather than inserting the new record into it.
672 BucketList<V> immutableBucketList;
673 if (inputList != null && !inputList.isEmpty()) {
674 // We need a bucket list with no records.
675 immutableBucketList = createBucketList();
677 if (buckets == null) {
678 buckets = createBucketList();
680 immutableBucketList = buckets;
682 return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
688 * @return The list of bucket labels, after processing.
691 public List<String> getBucketLabels() {
693 ArrayList<String> result = new ArrayList<String>();
694 for (Bucket<V> bucket : buckets) {
695 result.add(bucket.getLabel());
701 * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
702 * then stored. The next time it is accessed, the same instance is returned.
704 * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
705 * synchronizing.</i></b>
707 * @return a clone of the collator used internally
710 public RuleBasedCollator getCollator() {
711 if (collatorExternal == null) {
713 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
714 } catch (Exception e) {
715 // should never happen
716 throw new IllegalStateException("Collator cannot be cloned", e);
719 return collatorExternal;
723 * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
724 * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
725 * the first added comes first.
728 * Name, such as a name
730 * Data, such as an address or link
731 * @return this, for chaining
734 public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
735 // TODO instead of invalidating, just add to unprocessed list.
736 buckets = null; // invalidate old bucketlist
737 if (inputList == null) {
738 inputList = new ArrayList<Record<V>>();
740 inputList.add(new Record<V>(name, data));
745 * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
746 * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
747 * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
748 * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
749 * the index or collator.
751 * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
752 * those are changed, then the bucket number and sort key must be regenerated.
755 * Name, such as a name
756 * @return the bucket index for the name
759 public int getBucketIndex(CharSequence name) {
761 return buckets.getBucketIndex(name, collatorPrimaryOnly);
767 * @return this, for chaining
770 public AlphabeticIndex<V> clearRecords() {
771 if (inputList != null && !inputList.isEmpty()) {
779 * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
781 * @return number of buckets
784 public int getBucketCount() {
786 return buckets.getBucketCount();
790 * Return the number of records in the index: that is, the total number of distinct <name,data> pairs added with addRecord(...), over all the buckets.
792 * @return total number of records in buckets
795 public int getRecordCount() {
796 return inputList != null ? inputList.size() : 0;
800 * Return an iterator over the buckets.
802 * @return iterator over buckets.
805 public Iterator<Bucket<V>> iterator() {
807 return buckets.iterator();
811 * Creates an index, and buckets and sorts the list of records into the index.
813 private void initBuckets() {
814 if (buckets != null) {
817 buckets = createBucketList();
818 if (inputList == null || inputList.isEmpty()) {
822 // Sort the records by name.
823 // Stable sort preserves input order of collation duplicates.
824 Collections.sort(inputList, recordComparator);
826 // Now, we traverse all of the input, which is now sorted.
827 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
828 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
829 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
830 // we need to improve it for that case.
832 Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
833 Bucket<V> currentBucket = bucketIterator.next();
834 Bucket<V> nextBucket;
835 String upperBoundary;
836 if (bucketIterator.hasNext()) {
837 nextBucket = bucketIterator.next();
838 upperBoundary = nextBucket.lowerBoundary;
841 upperBoundary = null;
843 for (Record<V> r : inputList) {
844 // if the current bucket isn't the right one, find the one that is
845 // We have a special flag for the last bucket so that we don't look any further
846 while (upperBoundary != null &&
847 collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
848 currentBucket = nextBucket;
849 // now reset the boundary that we compare against
850 if (bucketIterator.hasNext()) {
851 nextBucket = bucketIterator.next();
852 upperBoundary = nextBucket.lowerBoundary;
854 upperBoundary = null;
857 // now put the record into the bucket.
858 Bucket<V> bucket = currentBucket;
859 if (bucket.displayBucket != null) {
860 bucket = bucket.displayBucket;
862 if (bucket.records == null) {
863 bucket.records = new ArrayList<Record<V>>();
865 bucket.records.add(r);
869 private int maxLabelCount = 99;
872 * Returns true if one index character string is "better" than the other.
873 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
874 * better, and otherwise binary-less-than is better.
876 private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
877 // This is called with primary-equal strings, but never with one.equals(other).
878 String n1 = nfkdNormalizer.normalize(one);
879 String n2 = nfkdNormalizer.normalize(other);
880 int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
884 result = binaryCmp.compare(n1, n2);
888 return binaryCmp.compare(one, other) < 0;
892 * A (name, data) pair, to be sorted by name into one of the index buckets.
893 * The user data is not used by the index implementation.
897 public static class Record<V> {
898 private final CharSequence name;
899 private final V data;
901 private Record(CharSequence name, V data) {
912 public CharSequence getName() {
927 * Standard toString()
930 public String toString() {
931 return name + "=" + data;
936 * An index "bucket" with a label string and type.
937 * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
938 * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
939 * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
940 * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
941 * into a bucket according to the record's name.
947 public static class Bucket<V> implements Iterable<Record<V>> {
948 private final String label;
949 private final String lowerBoundary;
950 private final LabelType labelType;
951 private Bucket<V> displayBucket;
952 private int displayIndex;
953 private List<Record<V>> records;
960 public enum LabelType {
961 NORMAL, UNDERFLOW, INFLOW, OVERFLOW
968 * label for the bucket
970 * is an underflow, overflow, or inflow bucket
973 private Bucket(String label, String lowerBoundary, LabelType labelType) {
975 this.lowerBoundary = lowerBoundary;
976 this.labelType = labelType;
982 * @return label for the bucket
985 public String getLabel() {
990 * Is a normal, underflow, overflow, or inflow bucket
992 * @return is an underflow, overflow, or inflow bucket
995 public LabelType getLabelType() {
1000 * Get the number of records in the bucket.
1002 * @return number of records in bucket
1006 return records == null ? 0 : records.size();
1010 * Iterator over the records in the bucket
1013 public Iterator<Record<V>> iterator() {
1014 if (records == null) {
1015 return Collections.<Record<V>>emptyList().iterator();
1017 return records.iterator();
1021 * Standard toString()
1025 public String toString() {
1027 "labelType=" + labelType
1029 "lowerBoundary=" + lowerBoundary
1037 private BucketList<V> createBucketList() {
1038 // Initialize indexCharacters.
1039 List<String> indexCharacters = initLabels();
1041 // Variables for hasMultiplePrimaryWeights().
1042 CollationElementIterator cei = collatorPrimaryOnly.getCollationElementIterator("");
1044 if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
1045 variableTop = CollationElementIterator.primaryOrder(collatorPrimaryOnly.getVariableTop());
1049 boolean hasInvisibleBuckets = false;
1051 // Helper arrays for Chinese Pinyin collation.
1052 @SuppressWarnings({ "rawtypes", "unchecked" })
1053 Bucket<V>[] asciiBuckets = new Bucket[26];
1054 @SuppressWarnings({ "rawtypes", "unchecked" })
1055 Bucket<V>[] pinyinBuckets = new Bucket[26];
1056 boolean hasPinyin = false;
1058 ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
1061 bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
1063 // fix up the list, adding underflow, additions, overflow
1064 // Insert inflow labels as needed.
1065 int scriptIndex = -1;
1066 String scriptUpperBoundary = "";
1067 for (String current : indexCharacters) {
1068 if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
1069 // We crossed the script boundary into a new script.
1070 String inflowBoundary = scriptUpperBoundary;
1071 boolean skippedScript = false;
1073 scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
1074 if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
1077 skippedScript = true;
1079 if (skippedScript && bucketList.size() > 1) {
1080 // We are skipping one or more scripts,
1081 // and we are not just getting out of the underflow label.
1082 bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
1086 // Add a bucket with the current label.
1087 Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
1088 bucketList.add(bucket);
1089 // Remember ASCII and Pinyin buckets for Pinyin redirects.
1091 if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
1092 asciiBuckets[c - 'A'] = bucket;
1093 } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
1094 'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
1095 pinyinBuckets[c - 'A'] = bucket;
1098 // Check for multiple primary weights.
1099 if (!current.startsWith(BASE) &&
1100 hasMultiplePrimaryWeights(cei, variableTop, current) &&
1101 !current.endsWith("\uffff")) {
1102 // "Æ" or "Sch" etc.
1103 for (int i = bucketList.size() - 2;; --i) {
1104 Bucket<V> singleBucket = bucketList.get(i);
1105 if (singleBucket.labelType != LabelType.NORMAL) {
1106 // There is no single-character bucket since the last
1107 // underflow or inflow label.
1110 if (singleBucket.displayBucket == null &&
1111 !hasMultiplePrimaryWeights(cei, variableTop, singleBucket.lowerBoundary)) {
1112 // Add an invisible bucket that redirects strings greater than the expansion
1113 // to the previous single-character bucket.
1114 // For example, after ... Q R S Sch we add Sch\uFFFF->S
1115 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
1116 bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
1117 bucket.displayBucket = singleBucket;
1118 bucketList.add(bucket);
1119 hasInvisibleBuckets = true;
1125 if (bucketList.size() == 1) {
1126 // No real labels, show only the underflow label.
1127 return new BucketList<V>(bucketList, bucketList);
1130 bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
1133 // Redirect Pinyin buckets.
1134 Bucket<V> asciiBucket = null;
1135 for (int i = 0; i < 26; ++i) {
1136 if (asciiBuckets[i] != null) {
1137 asciiBucket = asciiBuckets[i];
1139 if (pinyinBuckets[i] != null && asciiBucket != null) {
1140 pinyinBuckets[i].displayBucket = asciiBucket;
1141 hasInvisibleBuckets = true;
1146 if (!hasInvisibleBuckets) {
1147 return new BucketList<V>(bucketList, bucketList);
1149 // Merge inflow buckets that are visually adjacent.
1150 // Iterate backwards: Merge inflow into overflow rather than the other way around.
1151 int i = bucketList.size() - 1;
1152 Bucket<V> nextBucket = bucketList.get(i);
1154 Bucket<V> bucket = bucketList.get(i);
1155 if (bucket.displayBucket != null) {
1156 continue; // skip invisible buckets
1158 if (bucket.labelType == LabelType.INFLOW) {
1159 if (nextBucket.labelType != LabelType.NORMAL) {
1160 bucket.displayBucket = nextBucket;
1164 nextBucket = bucket;
1167 ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
1168 for (Bucket<V> bucket : bucketList) {
1169 if (bucket.displayBucket == null) {
1170 publicBucketList.add(bucket);
1173 return new BucketList<V>(bucketList, publicBucketList);
1176 private static class BucketList<V> implements Iterable<Bucket<V>> {
1177 private final ArrayList<Bucket<V>> bucketList;
1178 private final List<Bucket<V>> immutableVisibleList;
1180 private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
1181 this.bucketList = bucketList;
1183 int displayIndex = 0;
1184 for (Bucket<V> bucket : publicBucketList) {
1185 bucket.displayIndex = displayIndex++;
1187 immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1190 private int getBucketCount() {
1191 return immutableVisibleList.size();
1194 private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
1197 int limit = bucketList.size();
1198 while ((start + 1) < limit) {
1199 int i = (start + limit) / 2;
1200 Bucket<V> bucket = bucketList.get(i);
1201 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
1202 if (nameVsBucket < 0) {
1208 Bucket<V> bucket = bucketList.get(start);
1209 if (bucket.displayBucket != null) {
1210 bucket = bucket.displayBucket;
1212 return bucket.displayIndex;
1216 * Private iterator over all the buckets, visible and invisible
1218 private Iterator<Bucket<V>> fullIterator() {
1219 return bucketList.iterator();
1223 * Iterator over just the visible buckets.
1225 public Iterator<Bucket<V>> iterator() {
1226 return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1230 private static boolean hasMultiplePrimaryWeights(
1231 CollationElementIterator cei, int variableTop, String s) {
1233 boolean seenPrimary = false;
1235 int ce32 = cei.next();
1236 if (ce32 == CollationElementIterator.NULLORDER) {
1239 int p = CollationElementIterator.primaryOrder(ce32);
1240 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
1241 // not primary ignorable, and not a continuation CE
1252 * This list contains one character per script that has the
1253 * lowest primary weight for that script in the root collator.
1254 * This list will be copied and sorted to account for script reordering.
1256 * <p>TODO: This is fragile. If the first character of a script is tailored
1257 * so that it does not map to the script's lowest primary weight any more,
1258 * then the buckets will be off.
1259 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1261 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1263 private static final List<String> HACK_FIRST_CHARS_IN_SCRIPTS =
1264 Arrays.asList(new String[] {
1265 "A", "\u03B1", "\u2C81", "\u0430", "\u2C30", "\u10D0", "\u0561", "\u05D0", "\uD802\uDD00", "\u0800", "\u0621",
1267 "\u0840", // Mandaic
1268 "\u0780", "\u07CA", "\u2D30", "\u1200", "\u0950", "\u0985", "\u0A74", "\u0AD0", "\u0B05", "\u0BD0",
1269 "\u0C05", "\u0C85", "\u0D05", "\u0D85",
1270 "\uAAF2", // Meetei Mayek
1271 "\uA800", "\uA882", "\uD804\uDC83",
1272 UCharacter.toString(0x111C4), // Sharada
1273 UCharacter.toString(0x11680), // Takri
1274 "\u1B83", // Sundanese
1275 "\uD804\uDC05", // Brahmi (U+11005)
1276 "\uD802\uDE00", "\u0E01",
1278 "\uAA80", "\u0F40", "\u1C00", "\uA840", "\u1900", "\u1700", "\u1720", "\u1740", "\u1760",
1279 "\u1A00", // Buginese
1281 "\uA930", "\uA90A", "\u1000",
1282 UCharacter.toString(0x11103), // Chakma
1283 "\u1780", "\u1950", "\u1980", "\u1A20", "\uAA00", "\u1B05", "\uA984", "\u1880", "\u1C5A", "\u13A0", "\u1401", "\u1681", "\u16A0", "\uD803\uDC00", "\uA500", "\uA6A0", "\u1100",
1284 "\u3041", "\u30A1", "\u3105", "\uA000", "\uA4F8",
1285 UCharacter.toString(0x16F00), // Miao
1286 "\uD800\uDE80", "\uD800\uDEA0", "\uD802\uDD20", "\uD800\uDF00", "\uD800\uDF30", "\uD801\uDC28", "\uD801\uDC50", "\uD801\uDC80",
1287 UCharacter.toString(0x110D0), // Sora Sompeng
1288 "\uD800\uDC00", "\uD802\uDC00", "\uD802\uDE60", "\uD802\uDF00", "\uD802\uDC40",
1289 "\uD802\uDF40", "\uD802\uDF60", "\uD800\uDF80", "\uD800\uDFA0", "\uD808\uDC00", "\uD80C\uDC00",
1290 UCharacter.toString(0x109A0), // Meroitic Cursive
1291 UCharacter.toString(0x10980), // Meroitic Hieroglyphs
1293 // TODO: The overflow bucket's lowerBoundary string should be the
1294 // first item after the last reordering group in the collator's script order.
1295 // This should normally be the first Unicode code point
1296 // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1297 // However, at least up to ICU 51 the Hani reordering group includes
1298 // unassigned code points,
1299 // and there is no stable string for the start of the trailing-weights range.
1300 // The only known string that sorts "high" is U+FFFF.
1301 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1302 // and fix relevant test code.
1303 // Ideally, FractionalUCA.txt will have a "script first primary"
1304 // for unassigned code points.
1309 * Return a list of the first character in each script. Only exposed for testing.
1311 * @return list of first characters in each script
1313 * @deprecated This API is ICU internal, only for testing.
1315 public static Collection<String> getFirstCharactersInScripts() {
1316 return HACK_FIRST_CHARS_IN_SCRIPTS;