jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java

   1 //##header\r
   2 //#if defined(FOUNDATION10) || defined(J2SE13)\r
   3 //#else\r
   4 /*\r
   5  *******************************************************************************\r
   6  * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation\r
   7  * and others. All Rights Reserved.\r
   8  *******************************************************************************\r
   9  */\r
  10 package com.ibm.icu.text;\r
  11 \r
  12 import java.util.ArrayList;\r
  13 import java.util.Collection;\r
  14 import java.util.Collections;\r
  15 import java.util.Comparator;\r
  16 import java.util.Iterator;\r
  17 import java.util.LinkedHashMap;\r
  18 import java.util.LinkedHashSet;\r
  19 import java.util.List;\r
  20 import java.util.Map;\r
  21 import java.util.Set;\r
  22 import java.util.TreeSet;\r
  23 \r
  24 import com.ibm.icu.lang.UCharacter;\r
  25 import com.ibm.icu.util.LocaleData;\r
  26 import com.ibm.icu.util.ULocale;\r
  27 import com.ibm.icu.impl.CollectionUtilities.MultiComparator;\r
  28 \r
  29 /**\r
  30  * A set of characters for use as a UI "index", that is, a\r
  31  * list of clickable characters (or character sequences) that allow the user to\r
  32  * see a segment of a larger "target" list. That is, each character corresponds\r
  33  * to a bucket in the target list, where everything in the bucket is greater\r
  34  * than or equal to the character (according to the locale's collation). The\r
  35  * intention is to have two main functions; one that produces an index list that\r
  36  * is relatively static, and the other is a list that produces roughly\r
  37  * equally-sized buckets. Only the first is currently provided.\r
  38  * <p>\r
  39  * The static list would be presented as something like\r
  40  * \r
  41  * <pre>\r
  42  *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\r
  43  * </pre>\r
  44  * \r
  45  * In the UI, an index character could be omitted if its bucket is empty. For\r
  46  * example, if there is nothing in the bucket for Q, then Q could be omitted.\r
  47  * <p>\r
  48  * <b>Important Notes:</b>\r
  49  * <ul>\r
  50  * <li>Although we say "character" above, the index character could be a\r
  51  * sequence, like "CH".</li>\r
  52  * <li>There could be items in a target list that are less than the first or\r
  53  * (much) greater than the last; examples include words from other scripts. The\r
  54  * UI could bucket them with the first or last respectively, or have some symbol\r
  55  * for those categories.</li>\r
  56  * <li>The use of the list requires that the target list be sorted according to\r
  57  * the locale that is used to create that list.</li>\r
  58  * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)\r
  59  * the results may appear arbitrary, and it may be best not to use these methods.</li>\r
  60  * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>\r
  61  * </ul>\r
  62  * \r
  63  * @author markdavis\r
  64  * @draft ICU 4.2\r
  65  * @provisional This API might change or be removed in a future release.\r
  66  */\r
  67 //TODO(markdavis) return an additional character that is the "least greater" character than\r
  68 //the last character.\r
  69 public class IndexCharacters {\r
  70     public static final char CGJ = '\u034F';\r
  71     private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");\r
  72     private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14  \uC0AC  \uC544 \uC790  \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");\r
  73     private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");\r
  74     private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");\r
  75 \r
  76     private ULocale locale;\r
  77     private Collator comparator;\r
  78     private Set indexCharacters;\r
  79     private LinkedHashMap alreadyIn = new LinkedHashMap();\r
  80     private List noDistinctSorting = new ArrayList();\r
  81     private List notAlphabetic = new ArrayList();\r
  82 \r
  83     /**\r
  84      * Create the index object.\r
  85      * @param locale\r
  86      * @draft ICU 4.2\r
  87      * @provisional This API might change or be removed in a future release.\r
  88      */\r
  89     public IndexCharacters(ULocale locale) {\r
  90         this.locale = locale;\r
  91         comparator = Collator.getInstance(locale);\r
  92         comparator.setStrength(Collator.PRIMARY);\r
  93 \r
  94         // get the exemplars, and handle special cases\r
  95 \r
  96         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);\r
  97         // question: should we add auxiliary exemplars?\r
  98         if (exemplars.containsSome(CORE_LATIN)) {\r
  99             exemplars.addAll(CORE_LATIN);\r
 100         }\r
 101         if (exemplars.containsSome(HANGUL)) {\r
 102             // cut down to small list\r
 103             exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);\r
 104         }\r
 105         if (exemplars.containsSome(ETHIOPIC)) {\r
 106             // cut down to small list\r
 107             // make use of the fact that Ethiopic is allocated in 8's, where\r
 108             // the base is 0 mod 8.\r
 109             for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {\r
 110                 if ((it.codepoint & 0x7) != 0) {\r
 111                     exemplars.remove(it.codepoint);\r
 112                 }\r
 113             }\r
 114         }\r
 115 \r
 116         // first sort them, with an "best" ordering among items that are the same according\r
 117         // to the collator\r
 118 \r
 119         Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{\r
 120                 comparator, new PreferenceComparator(Collator.getInstance(locale))}));\r
 121         for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {\r
 122             preferenceSorting.add(it.getString());\r
 123         }\r
 124 \r
 125         indexCharacters = new TreeSet(comparator);\r
 126 \r
 127         // We nw make a sorted array of elements, uppercased\r
 128         // Some of the input may, however, be redundant.\r
 129         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"\r
 130         // So we make a pass through, filtering out those cases.\r
 131 \r
 132         for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {\r
 133             String item = (String) it.next();\r
 134             item = UCharacter.toUpperCase(locale, item);\r
 135             if (indexCharacters.contains(item)) {\r
 136                 for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {\r
 137                     Object itemAlreadyIn = it2.next();\r
 138                     if (comparator.compare(item, itemAlreadyIn) == 0) {\r
 139                         Set targets = (Set) alreadyIn.get(itemAlreadyIn);\r
 140                         if (targets == null) {\r
 141                             alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());\r
 142                         }\r
 143                         targets.add(item);\r
 144                         break;\r
 145                     }\r
 146                 }\r
 147             } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){\r
 148                 noDistinctSorting.add(item);\r
 149             } else if (!ALPHABETIC.containsSome(item)) {\r
 150                 notAlphabetic.add(item);\r
 151             } else {\r
 152                 indexCharacters.add(item);\r
 153             }\r
 154         }\r
 155 \r
 156         // if the result is still too large, cut down to 100 elements\r
 157 \r
 158         final int size = indexCharacters.size() - 1;\r
 159         if (size > 99) {\r
 160             int count = 0;\r
 161             int old = -1;\r
 162             for (Iterator it = indexCharacters.iterator(); it.hasNext();) {\r
 163                 ++ count;\r
 164                 it.next();\r
 165                 final int bump = count * 99 / size;\r
 166                 if (bump == old) {\r
 167                     it.remove();\r
 168                 } else {\r
 169                     old = bump;\r
 170                 }   \r
 171             }\r
 172         }\r
 173         indexCharacters = Collections.unmodifiableSet(indexCharacters);\r
 174     }\r
 175 \r
 176     /*\r
 177      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.\r
 178      */\r
 179     private String separated(String item) {\r
 180         StringBuffer result = new StringBuffer();\r
 181         // add a CGJ except within surrogates\r
 182         char last = item.charAt(0);\r
 183         result.append(last);\r
 184         for (int i = 1; i < item.length(); ++i) {\r
 185             char ch = item.charAt(i);\r
 186             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {\r
 187                 result.append(CGJ);\r
 188             }\r
 189             result.append(ch);\r
 190             last = ch;\r
 191         }\r
 192         return result.toString();\r
 193     }\r
 194 \r
 195     /**\r
 196      * Get the index characters.\r
 197      * @return A collection including the index characters\r
 198      * @draft ICU 4.2\r
 199      * @provisional This API might change or be removed in a future release.\r
 200      */\r
 201     public Collection getIndexCharacters() {\r
 202         return indexCharacters;\r
 203     }\r
 204 \r
 205     /**\r
 206      * Get the locale\r
 207      * @return The locale.\r
 208      * @draft ICU 4.2\r
 209      * @provisional This API might change or be removed in a future release.\r
 210      */\r
 211     public ULocale getLocale() {\r
 212         return locale;\r
 213     }\r
 214 \r
 215     /**\r
 216      * As the index is built, items may be discarded from the exemplars.\r
 217      * This contains some of the discards, and is intended for debugging.\r
 218      * @internal\r
 219      */\r
 220     public Map getAlreadyIn() {\r
 221         return alreadyIn;\r
 222     }\r
 223 \r
 224     /**\r
 225      * As the index is built, items may be discarded from the exemplars.\r
 226      * This contains some of the discards, and is intended for debugging.\r
 227      * @internal\r
 228      */\r
 229     public List getNoDistinctSorting() {\r
 230         return noDistinctSorting;\r
 231     }\r
 232 \r
 233     /**\r
 234      * As the index is built, items may be discarded from the exemplars.\r
 235      * This contains some of the discards, and is intended for debugging.\r
 236      * @internal\r
 237      */\r
 238     public List getNotAlphabetic() {\r
 239         return notAlphabetic;\r
 240     }\r
 241 \r
 242     /*\r
 243      * Comparator that returns "better" items first, where shorter NFKD is better,\r
 244      * and otherwise NFKD binary order is better, and otherwise binary order is better.\r
 245      */\r
 246     private static class PreferenceComparator implements Comparator {\r
 247         static final Comparator binary = new UTF16.StringComparator(true,false,0);\r
 248         final Collator collator;\r
 249 \r
 250         public PreferenceComparator(Collator collator) {\r
 251             this.collator = collator;\r
 252         }\r
 253         \r
 254         public int compare(Object o1, Object o2) {\r
 255             if (o1 == o2) {\r
 256                 return 0;\r
 257             }\r
 258             String s1 = (String) o1;\r
 259             String s2 = (String) o2;\r
 260             String n1 = Normalizer.decompose(s1, true);\r
 261             String n2 = Normalizer.decompose(s2, true);\r
 262             int result = n1.length() - n2.length();\r
 263             if (result != 0) {\r
 264                 return result;\r
 265             }\r
 266             result = collator.compare(n1, n2);\r
 267             if (result != 0) {\r
 268                 return result;\r
 269             }\r
 270             return binary.compare(s1, s2);\r
 271         }\r
 272     }\r
 273 }\r
 274 //#endif\r