X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=jars%2Ficu4j-4_2_1-src%2Fsrc%2Fcom%2Fibm%2Ficu%2Ftext%2FIndexCharacters.java;h=9fb47617f0d5dcc9701f71ea898aa13e9ef6d9eb;hb=127973afabe0c34015667c599d68bf9453d85652;hp=c3f7fa36ad0792fa40fc6b921a51015ff4dbbdd5;hpb=92dfc8b7d39cbc2e55f3c547c0c265bc7ae3af86;p=Dictionary.git

diff --git a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java
old mode 100755
new mode 100644
index c3f7fa3..9fb4761
--- a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java
+++ b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/IndexCharacters.java
@@ -1,274 +1,274 @@
-//##header
-//#if defined(FOUNDATION10) || defined(J2SE13)
-//#else
-/*
- *******************************************************************************
- * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
- * and others. All Rights Reserved.
- *******************************************************************************
- */
-package com.ibm.icu.text;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.util.LocaleData;
-import com.ibm.icu.util.ULocale;
-import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
-
-/**
- * A set of characters for use as a UI "index", that is, a
- * list of clickable characters (or character sequences) that allow the user to
- * see a segment of a larger "target" list. That is, each character corresponds
- * to a bucket in the target list, where everything in the bucket is greater
- * than or equal to the character (according to the locale's collation). The
- * intention is to have two main functions; one that produces an index list that
- * is relatively static, and the other is a list that produces roughly
- * equally-sized buckets. Only the first is currently provided.
- * <p>
- * The static list would be presented as something like
- * 
- * <pre>
- *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
- * </pre>
- * 
- * In the UI, an index character could be omitted if its bucket is empty. For
- * example, if there is nothing in the bucket for Q, then Q could be omitted.
- * <p>
- * <b>Important Notes:</b>
- * <ul>
- * <li>Although we say "character" above, the index character could be a
- * sequence, like "CH".</li>
- * <li>There could be items in a target list that are less than the first or
- * (much) greater than the last; examples include words from other scripts. The
- * UI could bucket them with the first or last respectively, or have some symbol
- * for those categories.</li>
- * <li>The use of the list requires that the target list be sorted according to
- * the locale that is used to create that list.</li>
- * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
- * the results may appear arbitrary, and it may be best not to use these methods.</li>
- * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
- * </ul>
- * 
- * @author markdavis
- * @draft ICU 4.2
- * @provisional This API might change or be removed in a future release.
- */
-//TODO(markdavis) return an additional character that is the "least greater" character than
-//the last character.
-public class IndexCharacters {
-    public static final char CGJ = '\u034F';
-    private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
-    private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14  \uC0AC  \uC544 \uC790  \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
-    private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
-    private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
-
-    private ULocale locale;
-    private Collator comparator;
-    private Set indexCharacters;
-    private LinkedHashMap alreadyIn = new LinkedHashMap();
-    private List noDistinctSorting = new ArrayList();
-    private List notAlphabetic = new ArrayList();
-
-    /**
-     * Create the index object.
-     * @param locale
-     * @draft ICU 4.2
-     * @provisional This API might change or be removed in a future release.
-     */
-    public IndexCharacters(ULocale locale) {
-        this.locale = locale;
-        comparator = Collator.getInstance(locale);
-        comparator.setStrength(Collator.PRIMARY);
-
-        // get the exemplars, and handle special cases
-
-        UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
-        // question: should we add auxiliary exemplars?
-        if (exemplars.containsSome(CORE_LATIN)) {
-            exemplars.addAll(CORE_LATIN);
-        }
-        if (exemplars.containsSome(HANGUL)) {
-            // cut down to small list
-            exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
-        }
-        if (exemplars.containsSome(ETHIOPIC)) {
-            // cut down to small list
-            // make use of the fact that Ethiopic is allocated in 8's, where
-            // the base is 0 mod 8.
-            for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
-                if ((it.codepoint & 0x7) != 0) {
-                    exemplars.remove(it.codepoint);
-                }
-            }
-        }
-
-        // first sort them, with an "best" ordering among items that are the same according
-        // to the collator
-
-        Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
-                comparator, new PreferenceComparator(Collator.getInstance(locale))}));
-        for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
-            preferenceSorting.add(it.getString());
-        }
-
-        indexCharacters = new TreeSet(comparator);
-
-        // We nw make a sorted array of elements, uppercased
-        // Some of the input may, however, be redundant.
-        // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
-        // So we make a pass through, filtering out those cases.
-
-        for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
-            String item = (String) it.next();
-            item = UCharacter.toUpperCase(locale, item);
-            if (indexCharacters.contains(item)) {
-                for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
-                    Object itemAlreadyIn = it2.next();
-                    if (comparator.compare(item, itemAlreadyIn) == 0) {
-                        Set targets = (Set) alreadyIn.get(itemAlreadyIn);
-                        if (targets == null) {
-                            alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
-                        }
-                        targets.add(item);
-                        break;
-                    }
-                }
-            } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
-                noDistinctSorting.add(item);
-            } else if (!ALPHABETIC.containsSome(item)) {
-                notAlphabetic.add(item);
-            } else {
-                indexCharacters.add(item);
-            }
-        }
-
-        // if the result is still too large, cut down to 100 elements
-
-        final int size = indexCharacters.size() - 1;
-        if (size > 99) {
-            int count = 0;
-            int old = -1;
-            for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
-                ++ count;
-                it.next();
-                final int bump = count * 99 / size;
-                if (bump == old) {
-                    it.remove();
-                } else {
-                    old = bump;
-                }   
-            }
-        }
-        indexCharacters = Collections.unmodifiableSet(indexCharacters);
-    }
-
-    /*
-     * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
-     */
-    private String separated(String item) {
-        StringBuffer result = new StringBuffer();
-        // add a CGJ except within surrogates
-        char last = item.charAt(0);
-        result.append(last);
-        for (int i = 1; i < item.length(); ++i) {
-            char ch = item.charAt(i);
-            if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
-                result.append(CGJ);
-            }
-            result.append(ch);
-            last = ch;
-        }
-        return result.toString();
-    }
-
-    /**
-     * Get the index characters.
-     * @return A collection including the index characters
-     * @draft ICU 4.2
-     * @provisional This API might change or be removed in a future release.
-     */
-    public Collection getIndexCharacters() {
-        return indexCharacters;
-    }
-
-    /**
-     * Get the locale
-     * @return The locale.
-     * @draft ICU 4.2
-     * @provisional This API might change or be removed in a future release.
-     */
-    public ULocale getLocale() {
-        return locale;
-    }
-
-    /**
-     * As the index is built, items may be discarded from the exemplars.
-     * This contains some of the discards, and is intended for debugging.
-     * @internal
-     */
-    public Map getAlreadyIn() {
-        return alreadyIn;
-    }
-
-    /**
-     * As the index is built, items may be discarded from the exemplars.
-     * This contains some of the discards, and is intended for debugging.
-     * @internal
-     */
-    public List getNoDistinctSorting() {
-        return noDistinctSorting;
-    }
-
-    /**
-     * As the index is built, items may be discarded from the exemplars.
-     * This contains some of the discards, and is intended for debugging.
-     * @internal
-     */
-    public List getNotAlphabetic() {
-        return notAlphabetic;
-    }
-
-    /*
-     * Comparator that returns "better" items first, where shorter NFKD is better,
-     * and otherwise NFKD binary order is better, and otherwise binary order is better.
-     */
-    private static class PreferenceComparator implements Comparator {
-        static final Comparator binary = new UTF16.StringComparator(true,false,0);
-        final Collator collator;
-
-        public PreferenceComparator(Collator collator) {
-            this.collator = collator;
-        }
-        
-        public int compare(Object o1, Object o2) {
-            if (o1 == o2) {
-                return 0;
-            }
-            String s1 = (String) o1;
-            String s2 = (String) o2;
-            String n1 = Normalizer.decompose(s1, true);
-            String n2 = Normalizer.decompose(s2, true);
-            int result = n1.length() - n2.length();
-            if (result != 0) {
-                return result;
-            }
-            result = collator.compare(n1, n2);
-            if (result != 0) {
-                return result;
-            }
-            return binary.compare(s1, s2);
-        }
-    }
-}
-//#endif
+//##header J2SE15
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//#else
+/*
+ *******************************************************************************
+ * Copyright (C) 2008-2009, Google Inc, International Business Machines Corporation
+ * and others. All Rights Reserved.
+ *******************************************************************************
+ */
+package com.ibm.icu.text;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.util.LocaleData;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.impl.CollectionUtilities.MultiComparator;
+
+/**
+ * A set of characters for use as a UI "index", that is, a
+ * list of clickable characters (or character sequences) that allow the user to
+ * see a segment of a larger "target" list. That is, each character corresponds
+ * to a bucket in the target list, where everything in the bucket is greater
+ * than or equal to the character (according to the locale's collation). The
+ * intention is to have two main functions; one that produces an index list that
+ * is relatively static, and the other is a list that produces roughly
+ * equally-sized buckets. Only the first is currently provided.
+ * <p>
+ * The static list would be presented as something like
+ * 
+ * <pre>
+ *  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
+ * </pre>
+ * 
+ * In the UI, an index character could be omitted if its bucket is empty. For
+ * example, if there is nothing in the bucket for Q, then Q could be omitted.
+ * <p>
+ * <b>Important Notes:</b>
+ * <ul>
+ * <li>Although we say "character" above, the index character could be a
+ * sequence, like "CH".</li>
+ * <li>There could be items in a target list that are less than the first or
+ * (much) greater than the last; examples include words from other scripts. The
+ * UI could bucket them with the first or last respectively, or have some symbol
+ * for those categories.</li>
+ * <li>The use of the list requires that the target list be sorted according to
+ * the locale that is used to create that list.</li>
+ * <li>For languages without widely accepted sorting methods (eg Chinese/Japanese)
+ * the results may appear arbitrary, and it may be best not to use these methods.</li>
+ * <li>In the initial version, an arbitrary limit of 100 is placed on these lists.</li>
+ * </ul>
+ * 
+ * @author markdavis
+ * @draft ICU 4.2
+ * @provisional This API might change or be removed in a future release.
+ */
+//TODO(markdavis) return an additional character that is the "least greater" character than
+//the last character.
+public class IndexCharacters {
+    public static final char CGJ = '\u034F';
+    private static final UnicodeSet ALPHABETIC = new UnicodeSet("[[:alphabetic:]-[:mark:]]");
+    private static final UnicodeSet HANGUL = new UnicodeSet("[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14  \uC0AC  \uC544 \uC790  \uCC28 \uCE74 \uD0C0 \uD30C \uD558]");
+    private static final UnicodeSet ETHIOPIC = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
+    private static final UnicodeSet CORE_LATIN = new UnicodeSet("[a-z]");
+
+    private ULocale locale;
+    private Collator comparator;
+    private Set indexCharacters;
+    private LinkedHashMap alreadyIn = new LinkedHashMap();
+    private List noDistinctSorting = new ArrayList();
+    private List notAlphabetic = new ArrayList();
+
+    /**
+     * Create the index object.
+     * @param locale
+     * @draft ICU 4.2
+     * @provisional This API might change or be removed in a future release.
+     */
+    public IndexCharacters(ULocale locale) {
+        this.locale = locale;
+        comparator = Collator.getInstance(locale);
+        comparator.setStrength(Collator.PRIMARY);
+
+        // get the exemplars, and handle special cases
+
+        UnicodeSet exemplars = LocaleData.getExemplarSet(locale, LocaleData.ES_STANDARD);
+        // question: should we add auxiliary exemplars?
+        if (exemplars.containsSome(CORE_LATIN)) {
+            exemplars.addAll(CORE_LATIN);
+        }
+        if (exemplars.containsSome(HANGUL)) {
+            // cut down to small list
+            exemplars.removeAll(new UnicodeSet("[:block=hangul_syllables:]")).addAll(HANGUL);
+        }
+        if (exemplars.containsSome(ETHIOPIC)) {
+            // cut down to small list
+            // make use of the fact that Ethiopic is allocated in 8's, where
+            // the base is 0 mod 8.
+            for (UnicodeSetIterator it = new UnicodeSetIterator(ETHIOPIC); it.next();) {
+                if ((it.codepoint & 0x7) != 0) {
+                    exemplars.remove(it.codepoint);
+                }
+            }
+        }
+
+        // first sort them, with an "best" ordering among items that are the same according
+        // to the collator
+
+        Set preferenceSorting = new TreeSet(new MultiComparator(new Comparator[]{
+                comparator, new PreferenceComparator(Collator.getInstance(locale))}));
+        for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
+            preferenceSorting.add(it.getString());
+        }
+
+        indexCharacters = new TreeSet(comparator);
+
+        // We nw make a sorted array of elements, uppercased
+        // Some of the input may, however, be redundant.
+        // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
+        // So we make a pass through, filtering out those cases.
+
+        for (Iterator it = preferenceSorting.iterator(); it.hasNext();) {
+            String item = (String) it.next();
+            item = UCharacter.toUpperCase(locale, item);
+            if (indexCharacters.contains(item)) {
+                for (Iterator it2 = indexCharacters.iterator(); it2.hasNext();) {
+                    Object itemAlreadyIn = it2.next();
+                    if (comparator.compare(item, itemAlreadyIn) == 0) {
+                        Set targets = (Set) alreadyIn.get(itemAlreadyIn);
+                        if (targets == null) {
+                            alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet());
+                        }
+                        targets.add(item);
+                        break;
+                    }
+                }
+            } else if (UTF16.countCodePoint(item) > 1 && comparator.compare(item, separated(item)) == 0){
+                noDistinctSorting.add(item);
+            } else if (!ALPHABETIC.containsSome(item)) {
+                notAlphabetic.add(item);
+            } else {
+                indexCharacters.add(item);
+            }
+        }
+
+        // if the result is still too large, cut down to 100 elements
+
+        final int size = indexCharacters.size() - 1;
+        if (size > 99) {
+            int count = 0;
+            int old = -1;
+            for (Iterator it = indexCharacters.iterator(); it.hasNext();) {
+                ++ count;
+                it.next();
+                final int bump = count * 99 / size;
+                if (bump == old) {
+                    it.remove();
+                } else {
+                    old = bump;
+                }   
+            }
+        }
+        indexCharacters = Collections.unmodifiableSet(indexCharacters);
+    }
+
+    /*
+     * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
+     */
+    private String separated(String item) {
+        StringBuffer result = new StringBuffer();
+        // add a CGJ except within surrogates
+        char last = item.charAt(0);
+        result.append(last);
+        for (int i = 1; i < item.length(); ++i) {
+            char ch = item.charAt(i);
+            if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
+                result.append(CGJ);
+            }
+            result.append(ch);
+            last = ch;
+        }
+        return result.toString();
+    }
+
+    /**
+     * Get the index characters.
+     * @return A collection including the index characters
+     * @draft ICU 4.2
+     * @provisional This API might change or be removed in a future release.
+     */
+    public Collection getIndexCharacters() {
+        return indexCharacters;
+    }
+
+    /**
+     * Get the locale
+     * @return The locale.
+     * @draft ICU 4.2
+     * @provisional This API might change or be removed in a future release.
+     */
+    public ULocale getLocale() {
+        return locale;
+    }
+
+    /**
+     * As the index is built, items may be discarded from the exemplars.
+     * This contains some of the discards, and is intended for debugging.
+     * @internal
+     */
+    public Map getAlreadyIn() {
+        return alreadyIn;
+    }
+
+    /**
+     * As the index is built, items may be discarded from the exemplars.
+     * This contains some of the discards, and is intended for debugging.
+     * @internal
+     */
+    public List getNoDistinctSorting() {
+        return noDistinctSorting;
+    }
+
+    /**
+     * As the index is built, items may be discarded from the exemplars.
+     * This contains some of the discards, and is intended for debugging.
+     * @internal
+     */
+    public List getNotAlphabetic() {
+        return notAlphabetic;
+    }
+
+    /*
+     * Comparator that returns "better" items first, where shorter NFKD is better,
+     * and otherwise NFKD binary order is better, and otherwise binary order is better.
+     */
+    private static class PreferenceComparator implements Comparator {
+        static final Comparator binary = new UTF16.StringComparator(true,false,0);
+        final Collator collator;
+
+        public PreferenceComparator(Collator collator) {
+            this.collator = collator;
+        }
+        
+        public int compare(Object o1, Object o2) {
+            if (o1 == o2) {
+                return 0;
+            }
+            String s1 = (String) o1;
+            String s2 = (String) o2;
+            String n1 = Normalizer.decompose(s1, true);
+            String n2 = Normalizer.decompose(s2, true);
+            int result = n1.length() - n2.length();
+            if (result != 0) {
+                return result;
+            }
+            result = collator.compare(n1, n2);
+            if (result != 0) {
+                return result;
+            }
+            return binary.compare(s1, s2);
+        }
+    }
+}
+//#endif