jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java

   1 //##header J2SE15
   2 //#if defined(FOUNDATION10) || defined(J2SE13)
   3 //#else
   4 /*
   5  *******************************************************************************
   6  * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
   7  * and others. All Rights Reserved.
   8  *******************************************************************************
   9  */
  10 package com.ibm.icu.text;
  11
  12 import java.util.ArrayList;
  13 import java.util.Collection;
  14 import java.util.Collections;
  15 import java.util.Comparator;
  16 import java.util.Iterator;
  17 import java.util.LinkedHashMap;
  18 import java.util.LinkedHashSet;
  19 import java.util.List;
  20 import java.util.Map;
  21 import java.util.Set;
  22 import java.util.TreeSet;
  23
  24 import com.ibm.icu.lang.UCharacter;
  25 import com.ibm.icu.util.LocaleData;
  26 import com.ibm.icu.util.ULocale;
  27 import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
  28
  29 /**
  30  * A set of characters for use as a UI "index", that is, a
  31  * list of clickable characters (or character sequences) that allow the user to
  32  * see a segment of a larger "target" list. That is, each character corresponds
  33  * to a bucket in the target list, where everything in the bucket is greater
  34  * than or equal to the character (according to the locale's collation). The
  35  * intention is to have two main functions; one that produces an index list that
  36  * is relatively static, and the other is a list that produces roughly
  37  * equally-sized buckets. Only the first is currently provided.
  38  * <p>
  39  * The static list would be presented as something like
  40  *
  41  * <pre>
  42  *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
  43  * </pre>
  44  *
  45  * In the UI, an index character could be omitted if its bucket is empty. For
  46  * example, if there is nothing in the bucket for Q, then Q could be omitted.
  47  * <p>
  48  * <b>Important Notes:</b>
  49  * <ul>
  50  * <li>Although we say "character" above, the index character could be a
  51  * sequence, like "CH".</li>
  52  * <li>There could be items in a target list that are less than the first or
  53  * (much) greater than the last; examples include words from other scripts. The
  54  * UI could bucket them with the first or last respectively, or have some symbol
  55  * for those categories.</li>
  56  * <li>The use of the list requires that the target list be sorted according to
  57  * the locale that is used to create that list.</li>
  58  * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
  59  * the results may appear arbitrary, and it may be best not to use these methods.</li>
  60  * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
  61  * </ul>
  62  *
  63  * @author markdavis
  64  * @draft ICU 4.2
  65  * @provisional This API might change or be removed in a future release.
  66  */
  67 //TODO(markdavis) return an additional character that is the "least greater" character than
  68 //the last character.
  69 public class IndexCharacters {
  70     public static final char CGJ = '\u034F';
  71     private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
  72     private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14  \uC0AC  \uC544 \uC790  \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
  73     private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
  74     private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
  75
  76     private ULocale locale;
  77     private Collator comparator;
  78     private Set indexCharacters;
  79     private LinkedHashMap alreadyIn = new LinkedHashMap();
  80     private List noDistinctSorting = new ArrayList();
  81     private List notAlphabetic = new ArrayList();
  82
  83     /**
  84      * Create the index object.
  85      * @param locale
  86      * @draft ICU 4.2
  87      * @provisional This API might change or be removed in a future release.
  88      */
  89     public IndexCharacters(ULocale locale) {
  90         this.locale = locale;
  91         comparator = Collator.getInstance(locale);
  92         comparator.setStrength(Collator.PRIMARY);
  93
  94         // get the exemplars, and handle special cases
  95
  96         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
  97         // question: should we add auxiliary exemplars?
  98         if (exemplars.containsSome(CORE_LATIN)) {
  99             exemplars.addAll(CORE_LATIN);
 100         }
 101         if (exemplars.containsSome(HANGUL)) {
 102             // cut down to small list
 103             exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
 104         }
 105         if (exemplars.containsSome(ETHIOPIC)) {
 106             // cut down to small list
 107             // make use of the fact that Ethiopic is allocated in 8's, where
 108             // the base is 0 mod 8.
 109             for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
 110                 if ((it.codepoint & 0x7) != 0) {
 111                     exemplars.remove(it.codepoint);
 112                 }
 113             }
 114         }
 115
 116         // first sort them, with an "best" ordering among items that are the same according
 117         // to the collator
 118
 119         Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
 120                 comparator, new PreferenceComparator(Collator.getInstance(locale))}));
 121         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
 122             preferenceSorting.add(it.getString());
 123         }
 124
 125         indexCharacters = new TreeSet(comparator);
 126
 127         // We nw make a sorted array of elements, uppercased
 128         // Some of the input may, however, be redundant.
 129         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
 130         // So we make a pass through, filtering out those cases.
 131
 132         for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
 133             String item = (String) it.next();
 134             item = UCharacter.toUpperCase(locale, item);
 135             if (indexCharacters.contains(item)) {
 136                 for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
 137                     Object itemAlreadyIn = it2.next();
 138                     if (comparator.compare(item, itemAlreadyIn) == 0) {
 139                         Set targets = (Set) alreadyIn.get(itemAlreadyIn);
 140                         if (targets == null) {
 141                             alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
 142                         }
 143                         targets.add(item);
 144                         break;
 145                     }
 146                 }
 147             } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
 148                 noDistinctSorting.add(item);
 149             } else if (!ALPHABETIC.containsSome(item)) {
 150                 notAlphabetic.add(item);
 151             } else {
 152                 indexCharacters.add(item);
 153             }
 154         }
 155
 156         // if the result is still too large, cut down to 100 elements
 157
 158         final int size = indexCharacters.size() - 1;
 159         if (size > 99) {
 160             int count = 0;
 161             int old = -1;
 162             for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
 163                 ++ count;
 164                 it.next();
 165                 final int bump = count * 99 / size;
 166                 if (bump == old) {
 167                     it.remove();
 168                 } else {
 169                     old = bump;
 170                 }
 171             }
 172         }
 173         indexCharacters = Collections.unmodifiableSet(indexCharacters);
 174     }
 175
 176     /*
 177      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
 178      */
 179     private String separated(String item) {
 180         StringBuffer result = new StringBuffer();
 181         // add a CGJ except within surrogates
 182         char last = item.charAt(0);
 183         result.append(last);
 184         for (int i = 1; i < item.length(); ++i) {
 185             char ch = item.charAt(i);
 186             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
 187                 result.append(CGJ);
 188             }
 189             result.append(ch);
 190             last = ch;
 191         }
 192         return result.toString();
 193     }
 194
 195     /**
 196      * Get the index characters.
 197      * @return A collection including the index characters
 198      * @draft ICU 4.2
 199      * @provisional This API might change or be removed in a future release.
 200      */
 201     public Collection getIndexCharacters() {
 202         return indexCharacters;
 203     }
 204
 205     /**
 206      * Get the locale
 207      * @return The locale.
 208      * @draft ICU 4.2
 209      * @provisional This API might change or be removed in a future release.
 210      */
 211     public ULocale getLocale() {
 212         return locale;
 213     }
 214
 215     /**
 216      * As the index is built, items may be discarded from the exemplars.
 217      * This contains some of the discards, and is intended for debugging.
 218      * @internal
 219      */
 220     public Map getAlreadyIn() {
 221         return alreadyIn;
 222     }
 223
 224     /**
 225      * As the index is built, items may be discarded from the exemplars.
 226      * This contains some of the discards, and is intended for debugging.
 227      * @internal
 228      */
 229     public List getNoDistinctSorting() {
 230         return noDistinctSorting;
 231     }
 232
 233     /**
 234      * As the index is built, items may be discarded from the exemplars.
 235      * This contains some of the discards, and is intended for debugging.
 236      * @internal
 237      */
 238     public List getNotAlphabetic() {
 239         return notAlphabetic;
 240     }
 241
 242     /*
 243      * Comparator that returns "better" items first, where shorter NFKD is better,
 244      * and otherwise NFKD binary order is better, and otherwise binary order is better.
 245      */
 246     private static class PreferenceComparator implements Comparator {
 247         static final Comparator binary = new UTF16.StringComparator(true,false,0);
 248         final Collator collator;
 249
 250         public PreferenceComparator(Collator collator) {
 251             this.collator = collator;
 252         }
 253
 254         public int compare(Object o1, Object o2) {
 255             if (o1 == o2) {
 256                 return 0;
 257             }
 258             String s1 = (String) o1;
 259             String s2 = (String) o2;
 260             String n1 = Normalizer.decompose(s1, true);
 261             String n2 = Normalizer.decompose(s2, true);
 262             int result = n1.length() - n2.length();
 263             if (result != 0) {
 264                 return result;
 265             }
 266             result = collator.compare(n1, n2);
 267             if (result != 0) {
 268                 return result;
 269             }
 270             return binary.compare(s1, s2);
 271         }
 272     }
 273 }
 274 //#endif