X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=jars%2Ficu4j-4_2_1-src%2Fsrc%2Fcom%2Fibm%2Ficu%2Ftext%2FIndexCharacters.java;h=9fb47617f0d5dcc9701f71ea898aa13e9ef6d9eb;hb=127973afabe0c34015667c599d68bf9453d85652;hp=c3f7fa36ad0792fa40fc6b921a51015ff4dbbdd5;hpb=92dfc8b7d39cbc2e55f3c547c0c265bc7ae3af86;p=Dictionary.git
diff --git a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java
old mode 100755
new mode 100644
index c3f7fa3..9fb4761
--- a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java
+++ b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java
@@ -1,274 +1,274 @@
-//##header
-//#if defined(FOUNDATION10) || defined(J2SE13)
-//#else
-/*
- *******************************************************************************
- * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
- * and others. All Rights Reserved.
- *******************************************************************************
- */
-package com.ibm.icu.text;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.util.LocaleData;
-import com.ibm.icu.util.ULocale;
-import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
-
-/**
- * A set of characters for use as a UI "index", that is, a
- * list of clickable characters (or character sequences) that allow the user to
- * see a segment of a larger "target" list. That is, each character corresponds
- * to a bucket in the target list, where everything in the bucket is greater
- * than or equal to the character (according to the locale's collation). The
- * intention is to have two main functions; one that produces an index list that
- * is relatively static, and the other is a list that produces roughly
- * equally-sized buckets. Only the first is currently provided.
- *
- * The static list would be presented as something like
- *
- *
- * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
- *
- *
- * In the UI, an index character could be omitted if its bucket is empty. For
- * example, if there is nothing in the bucket for Q, then Q could be omitted.
- *
- * Important Notes:
- *
- * - Although we say "character" above, the index character could be a
- * sequence, like "CH".
- * - There could be items in a target list that are less than the first or
- * (much) greater than the last; examples include words from other scripts. The
- * UI could bucket them with the first or last respectively, or have some symbol
- * for those categories.
- * - The use of the list requires that the target list be sorted according to
- * the locale that is used to create that list.
- * - For languages without widely accepted sorting methods (eg Chinese/Japanese)
- * the results may appear arbitrary, and it may be best not to use these methods.
- * - In the initial version, an arbitrary limit of 100 is placed on these lists.
- *
- *
- * @author markdavis
- * @draft ICU 4.2
- * @provisional This API might change or be removed in a future release.
- */
-//TODO(markdavis) return an additional character that is the "least greater" character than
-//the last character.
-public class IndexCharacters {
- public static final char CGJ = '\u034F';
- private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
- private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
- private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
- private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
-
- private ULocale locale;
- private Collator comparator;
- private Set indexCharacters;
- private LinkedHashMap alreadyIn = new LinkedHashMap();
- private List noDistinctSorting = new ArrayList();
- private List notAlphabetic = new ArrayList();
-
- /**
- * Create the index object.
- * @param locale
- * @draft ICU 4.2
- * @provisional This API might change or be removed in a future release.
- */
- public IndexCharacters(ULocale locale) {
- this.locale = locale;
- comparator = Collator.getInstance(locale);
- comparator.setStrength(Collator.PRIMARY);
-
- // get the exemplars, and handle special cases
-
- UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
- // question: should we add auxiliary exemplars?
- if (exemplars.containsSome(CORE_LATIN)) {
- exemplars.addAll(CORE_LATIN);
- }
- if (exemplars.containsSome(HANGUL)) {
- // cut down to small list
- exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
- }
- if (exemplars.containsSome(ETHIOPIC)) {
- // cut down to small list
- // make use of the fact that Ethiopic is allocated in 8's, where
- // the base is 0 mod 8.
- for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
- if ((it.codepoint & 0x7) != 0) {
- exemplars.remove(it.codepoint);
- }
- }
- }
-
- // first sort them, with an "best" ordering among items that are the same according
- // to the collator
-
- Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
- comparator, new PreferenceComparator(Collator.getInstance(locale))}));
- for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
- preferenceSorting.add(it.getString());
- }
-
- indexCharacters = new TreeSet(comparator);
-
- // We nw make a sorted array of elements, uppercased
- // Some of the input may, however, be redundant.
- // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
- // So we make a pass through, filtering out those cases.
-
- for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
- String item = (String) it.next();
- item = UCharacter.toUpperCase(locale, item);
- if (indexCharacters.contains(item)) {
- for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
- Object itemAlreadyIn = it2.next();
- if (comparator.compare(item, itemAlreadyIn) == 0) {
- Set targets = (Set) alreadyIn.get(itemAlreadyIn);
- if (targets == null) {
- alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
- }
- targets.add(item);
- break;
- }
- }
- } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
- noDistinctSorting.add(item);
- } else if (!ALPHABETIC.containsSome(item)) {
- notAlphabetic.add(item);
- } else {
- indexCharacters.add(item);
- }
- }
-
- // if the result is still too large, cut down to 100 elements
-
- final int size = indexCharacters.size() - 1;
- if (size > 99) {
- int count = 0;
- int old = -1;
- for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
- ++ count;
- it.next();
- final int bump = count * 99 / size;
- if (bump == old) {
- it.remove();
- } else {
- old = bump;
- }
- }
- }
- indexCharacters = Collections.unmodifiableSet(indexCharacters);
- }
-
- /*
- * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
- */
- private String separated(String item) {
- StringBuffer result = new StringBuffer();
- // add a CGJ except within surrogates
- char last = item.charAt(0);
- result.append(last);
- for (int i = 1; i < item.length(); ++i) {
- char ch = item.charAt(i);
- if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
- result.append(CGJ);
- }
- result.append(ch);
- last = ch;
- }
- return result.toString();
- }
-
- /**
- * Get the index characters.
- * @return A collection including the index characters
- * @draft ICU 4.2
- * @provisional This API might change or be removed in a future release.
- */
- public Collection getIndexCharacters() {
- return indexCharacters;
- }
-
- /**
- * Get the locale
- * @return The locale.
- * @draft ICU 4.2
- * @provisional This API might change or be removed in a future release.
- */
- public ULocale getLocale() {
- return locale;
- }
-
- /**
- * As the index is built, items may be discarded from the exemplars.
- * This contains some of the discards, and is intended for debugging.
- * @internal
- */
- public Map getAlreadyIn() {
- return alreadyIn;
- }
-
- /**
- * As the index is built, items may be discarded from the exemplars.
- * This contains some of the discards, and is intended for debugging.
- * @internal
- */
- public List getNoDistinctSorting() {
- return noDistinctSorting;
- }
-
- /**
- * As the index is built, items may be discarded from the exemplars.
- * This contains some of the discards, and is intended for debugging.
- * @internal
- */
- public List getNotAlphabetic() {
- return notAlphabetic;
- }
-
- /*
- * Comparator that returns "better" items first, where shorter NFKD is better,
- * and otherwise NFKD binary order is better, and otherwise binary order is better.
- */
- private static class PreferenceComparator implements Comparator {
- static final Comparator binary = new UTF16.StringComparator(true,false,0);
- final Collator collator;
-
- public PreferenceComparator(Collator collator) {
- this.collator = collator;
- }
-
- public int compare(Object o1, Object o2) {
- if (o1 == o2) {
- return 0;
- }
- String s1 = (String) o1;
- String s2 = (String) o2;
- String n1 = Normalizer.decompose(s1, true);
- String n2 = Normalizer.decompose(s2, true);
- int result = n1.length() - n2.length();
- if (result != 0) {
- return result;
- }
- result = collator.compare(n1, n2);
- if (result != 0) {
- return result;
- }
- return binary.compare(s1, s2);
- }
- }
-}
-//#endif
+//##header J2SE15
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//#else
+/*
+ *******************************************************************************
+ * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
+ * and others. All Rights Reserved.
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.util.LocaleData;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
+
+/**
+ * A set of characters for use as a UI "index", that is, a
+ * list of clickable characters (or character sequences) that allow the user to
+ * see a segment of a larger "target" list. That is, each character corresponds
+ * to a bucket in the target list, where everything in the bucket is greater
+ * than or equal to the character (according to the locale's collation). The
+ * intention is to have two main functions; one that produces an index list that
+ * is relatively static, and the other is a list that produces roughly
+ * equally-sized buckets. Only the first is currently provided.
+ *
+ * The static list would be presented as something like
+ *
+ *
+ * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
+ *
+ *
+ * In the UI, an index character could be omitted if its bucket is empty. For
+ * example, if there is nothing in the bucket for Q, then Q could be omitted.
+ *
+ * Important Notes:
+ *
+ * - Although we say "character" above, the index character could be a
+ * sequence, like "CH".
+ * - There could be items in a target list that are less than the first or
+ * (much) greater than the last; examples include words from other scripts. The
+ * UI could bucket them with the first or last respectively, or have some symbol
+ * for those categories.
+ * - The use of the list requires that the target list be sorted according to
+ * the locale that is used to create that list.
+ * - For languages without widely accepted sorting methods (eg Chinese/Japanese)
+ * the results may appear arbitrary, and it may be best not to use these methods.
+ * - In the initial version, an arbitrary limit of 100 is placed on these lists.
+ *
+ *
+ * @author markdavis
+ * @draft ICU 4.2
+ * @provisional This API might change or be removed in a future release.
+ */
+//TODO(markdavis) return an additional character that is the "least greater" character than
+//the last character.
+public class IndexCharacters {
+ public static final char CGJ = '\u034F';
+ private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
+ private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544 \uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
+ private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
+ private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
+
+ private ULocale locale;
+ private Collator comparator;
+ private Set indexCharacters;
+ private LinkedHashMap alreadyIn = new LinkedHashMap();
+ private List noDistinctSorting = new ArrayList();
+ private List notAlphabetic = new ArrayList();
+
+ /**
+ * Create the index object.
+ * @param locale
+ * @draft ICU 4.2
+ * @provisional This API might change or be removed in a future release.
+ */
+ public IndexCharacters(ULocale locale) {
+ this.locale = locale;
+ comparator = Collator.getInstance(locale);
+ comparator.setStrength(Collator.PRIMARY);
+
+ // get the exemplars, and handle special cases
+
+ UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
+ // question: should we add auxiliary exemplars?
+ if (exemplars.containsSome(CORE_LATIN)) {
+ exemplars.addAll(CORE_LATIN);
+ }
+ if (exemplars.containsSome(HANGUL)) {
+ // cut down to small list
+ exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
+ }
+ if (exemplars.containsSome(ETHIOPIC)) {
+ // cut down to small list
+ // make use of the fact that Ethiopic is allocated in 8's, where
+ // the base is 0 mod 8.
+ for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
+ if ((it.codepoint & 0x7) != 0) {
+ exemplars.remove(it.codepoint);
+ }
+ }
+ }
+
+ // first sort them, with an "best" ordering among items that are the same according
+ // to the collator
+
+ Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
+ comparator, new PreferenceComparator(Collator.getInstance(locale))}));
+ for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
+ preferenceSorting.add(it.getString());
+ }
+
+ indexCharacters = new TreeSet(comparator);
+
+ // We nw make a sorted array of elements, uppercased
+ // Some of the input may, however, be redundant.
+ // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
+ // So we make a pass through, filtering out those cases.
+
+ for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
+ String item = (String) it.next();
+ item = UCharacter.toUpperCase(locale, item);
+ if (indexCharacters.contains(item)) {
+ for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
+ Object itemAlreadyIn = it2.next();
+ if (comparator.compare(item, itemAlreadyIn) == 0) {
+ Set targets = (Set) alreadyIn.get(itemAlreadyIn);
+ if (targets == null) {
+ alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
+ }
+ targets.add(item);
+ break;
+ }
+ }
+ } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
+ noDistinctSorting.add(item);
+ } else if (!ALPHABETIC.containsSome(item)) {
+ notAlphabetic.add(item);
+ } else {
+ indexCharacters.add(item);
+ }
+ }
+
+ // if the result is still too large, cut down to 100 elements
+
+ final int size = indexCharacters.size() - 1;
+ if (size > 99) {
+ int count = 0;
+ int old = -1;
+ for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
+ ++ count;
+ it.next();
+ final int bump = count * 99 / size;
+ if (bump == old) {
+ it.remove();
+ } else {
+ old = bump;
+ }
+ }
+ }
+ indexCharacters = Collections.unmodifiableSet(indexCharacters);
+ }
+
+ /*
+ * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
+ */
+ private String separated(String item) {
+ StringBuffer result = new StringBuffer();
+ // add a CGJ except within surrogates
+ char last = item.charAt(0);
+ result.append(last);
+ for (int i = 1; i < item.length(); ++i) {
+ char ch = item.charAt(i);
+ if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
+ result.append(CGJ);
+ }
+ result.append(ch);
+ last = ch;
+ }
+ return result.toString();
+ }
+
+ /**
+ * Get the index characters.
+ * @return A collection including the index characters
+ * @draft ICU 4.2
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Collection getIndexCharacters() {
+ return indexCharacters;
+ }
+
+ /**
+ * Get the locale
+ * @return The locale.
+ * @draft ICU 4.2
+ * @provisional This API might change or be removed in a future release.
+ */
+ public ULocale getLocale() {
+ return locale;
+ }
+
+ /**
+ * As the index is built, items may be discarded from the exemplars.
+ * This contains some of the discards, and is intended for debugging.
+ * @internal
+ */
+ public Map getAlreadyIn() {
+ return alreadyIn;
+ }
+
+ /**
+ * As the index is built, items may be discarded from the exemplars.
+ * This contains some of the discards, and is intended for debugging.
+ * @internal
+ */
+ public List getNoDistinctSorting() {
+ return noDistinctSorting;
+ }
+
+ /**
+ * As the index is built, items may be discarded from the exemplars.
+ * This contains some of the discards, and is intended for debugging.
+ * @internal
+ */
+ public List getNotAlphabetic() {
+ return notAlphabetic;
+ }
+
+ /*
+ * Comparator that returns "better" items first, where shorter NFKD is better,
+ * and otherwise NFKD binary order is better, and otherwise binary order is better.
+ */
+ private static class PreferenceComparator implements Comparator {
+ static final Comparator binary = new UTF16.StringComparator(true,false,0);
+ final Collator collator;
+
+ public PreferenceComparator(Collator collator) {
+ this.collator = collator;
+ }
+
+ public int compare(Object o1, Object o2) {
+ if (o1 == o2) {
+ return 0;
+ }
+ String s1 = (String) o1;
+ String s2 = (String) o2;
+ String n1 = Normalizer.decompose(s1, true);
+ String n2 = Normalizer.decompose(s2, true);
+ int result = n1.length() - n2.length();
+ if (result != 0) {
+ return result;
+ }
+ result = collator.compare(n1, n2);
+ if (result != 0) {
+ return result;
+ }
+ return binary.compare(s1, s2);
+ }
+ }
+}
+//#endif