jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetSelector.java

   1 /*\r
   2  ******************************************************************************\r
   3  * Copyright (C) 1996-2010, International Business Machines Corporation and   *\r
   4  * others. All Rights Reserved.                                               *\r
   5  ******************************************************************************\r
   6  */\r
   7 \r
   8 /* \r
   9  * This is a port of the C++ class UConverterSelector. \r
  10  *\r
  11  * Methods related to serialization are not ported in this version. In addition,\r
  12  * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used\r
  13  * in Java.\r
  14  * \r
  15  * @author Shaopeng Jia\r
  16  */\r
  17 \r
  18 package com.ibm.icu.charset;\r
  19 \r
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.impl.PropsVectors;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
  30 \r
  31 /**\r
  32  * Charset Selector\r
  33  * \r
  34  * A charset selector is built with a list of charset names and given an input\r
  35  * CharSequence returns the list of names the corresponding charsets which can\r
  36  * convert the CharSequence.\r
  37  * \r
  38  * @stable ICU 4.2\r
  39  */\r
  40 public final class CharsetSelector {\r
  41     private IntTrie trie;\r
  42     private int[] pv; // table of bits\r
  43     private String[] encodings; // encodings users ask to use\r
  44 \r
  45     private void generateSelectorData(PropsVectors pvec,\r
  46             UnicodeSet excludedCodePoints, int mappingTypes) {\r
  47         int columns = (encodings.length + 31) / 32;\r
  48 \r
  49         // set errorValue to all-ones\r
  50         for (int col = 0; col < columns; ++col) {\r
  51             pvec.setValue(PropsVectors.ERROR_VALUE_CP,\r
  52                     PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);\r
  53         }\r
  54 \r
  55         for (int i = 0; i < encodings.length; ++i) {\r
  56             Charset testCharset = CharsetICU.forNameICU(encodings[i]);\r
  57             UnicodeSet unicodePointSet = new UnicodeSet(); // empty set\r
  58             ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,\r
  59                     mappingTypes);\r
  60             int column = i / 32;\r
  61             int mask = 1 << (i % 32);\r
  62             // now iterate over intervals on set i\r
  63             int itemCount = unicodePointSet.getRangeCount();\r
  64             for (int j = 0; j < itemCount; ++j) {\r
  65                 int startChar = unicodePointSet.getRangeStart(j);\r
  66                 int endChar = unicodePointSet.getRangeEnd(j);\r
  67                 pvec.setValue(startChar, endChar, column, ~0, mask);\r
  68             }\r
  69         }\r
  70 \r
  71         // handle excluded encodings\r
  72         // Simply set their values to all 1's in the pvec\r
  73         if (!excludedCodePoints.isEmpty()) {\r
  74             int itemCount = excludedCodePoints.getRangeCount();\r
  75             for (int j = 0; j < itemCount; ++j) {\r
  76                 int startChar = excludedCodePoints.getRangeStart(j);\r
  77                 int endChar = excludedCodePoints.getRangeEnd(j);\r
  78                 for (int col = 0; col < columns; col++) {\r
  79                     pvec.setValue(startChar, endChar, col, ~0, ~0);\r
  80                 }\r
  81             }\r
  82         }\r
  83 \r
  84         trie = pvec.compactToTrieWithRowIndexes();\r
  85         pv = pvec.getCompactedArray();\r
  86     }\r
  87 \r
  88     // internal function to intersect two sets of masks\r
  89     // returns whether the mask has reduced to all zeros. The\r
  90     // second set of mask consists of len elements in pv starting from \r
  91     // pvIndex\r
  92     private boolean intersectMasks(int[] dest, int pvIndex, int len) {\r
  93         int oredDest = 0;\r
  94         for (int i = 0; i < len; ++i) {\r
  95             oredDest |= (dest[i] &= pv[pvIndex + i]);\r
  96         }\r
  97         return oredDest == 0;\r
  98     }\r
  99 \r
 100     // internal function\r
 101     private List<String> selectForMask(int[] mask) {\r
 102         // this is the context we will use. Store a table of indices to which\r
 103         // encodings are legit\r
 104 \r
 105         Vector<String> result = new Vector<String>();\r
 106         int columns = (encodings.length + 31) / 32;\r
 107         int numOnes = countOnes(mask, columns);\r
 108 \r
 109         // now we know the exact space we need to index\r
 110         if (numOnes > 0) {\r
 111             int k = 0;\r
 112             for (int j = 0; j < columns; j++) {\r
 113                 int v = mask[j];\r
 114                 for (int i = 0; i < 32 && k < encodings.length; i++, k++) {\r
 115                     if ((v & 1) != 0) {\r
 116                         result.addElement(encodings[k]);\r
 117                     }\r
 118                     v >>= 1;\r
 119                 }\r
 120             }\r
 121         }\r
 122 \r
 123         // otherwise, index will remain NULL\r
 124         return result;\r
 125     }\r
 126 \r
 127     // internal function to count how many 1's are there in a mask\r
 128     // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html\r
 129     private int countOnes(int[] mask, int len) {\r
 130         int totalOnes = 0;\r
 131         for (int i = 0; i < len; ++i) {\r
 132             int ent = mask[i];\r
 133             for (; ent != 0; totalOnes++) {\r
 134                 ent &= ent - 1; // clear the least significant bit set\r
 135             }\r
 136         }\r
 137         return totalOnes;\r
 138     }\r
 139 \r
 140     /**\r
 141      * Construct a CharsetSelector from a list of charset names.\r
 142      * \r
 143      * @param charsetList\r
 144      *            a list of charset names in the form of strings. If charsetList\r
 145      *            is empty, a selector for all available charset is constructed.\r
 146      * @param excludedCodePoints\r
 147      *            a set of code points to be excluded from consideration.\r
 148      *            Excluded code points appearing in the input CharSequence do\r
 149      *            not change the selection result. It could be empty when no\r
 150      *            code point should be excluded.\r
 151      * @param mappingTypes\r
 152      *            an int which determines whether to consider only roundtrip\r
 153      *            mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See\r
 154      *            CharsetICU.java for the constants that are currently\r
 155      *            supported.\r
 156      * @throws IllegalArgumentException\r
 157      *             if the parameters is invalid.\r
 158      * @throws IllegalCharsetNameException\r
 159      *             If the given charset name is illegal.\r
 160      * @throws UnsupportedCharsetException\r
 161      *             If no support for the named charset is available in this\r
 162      *             instance of the Java virtual machine.\r
 163      * @stable ICU 4.2\r
 164      */\r
 165     public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,\r
 166             int mappingTypes) {\r
 167         if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET\r
 168                 && mappingTypes != CharsetICU.ROUNDTRIP_SET) {\r
 169             throw new IllegalArgumentException("Unsupported mappingTypes");\r
 170         }\r
 171 \r
 172         int encodingCount = charsetList.size();\r
 173         if (encodingCount > 0) {\r
 174             encodings = charsetList.toArray(new String[0]);\r
 175         } else {\r
 176             encodings = CharsetProviderICU.getAvailableNames();\r
 177             encodingCount = encodings.length;\r
 178         }\r
 179 \r
 180         PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32);\r
 181         generateSelectorData(pvec, excludedCodePoints, mappingTypes);\r
 182     }\r
 183 \r
 184     /**\r
 185      * Select charsets that can map all characters in a CharSequence, ignoring\r
 186      * the excluded code points.\r
 187      * \r
 188      * @param unicodeText\r
 189      *            a CharSequence. It could be empty.\r
 190      * @return a list that contains charset names in the form of strings. The\r
 191      *         returned encoding names and their order will be the same as\r
 192      *         supplied when building the selector.\r
 193      * \r
 194      * @stable ICU 4.2\r
 195      */\r
 196     public List<String> selectForString(CharSequence unicodeText) {\r
 197         int columns = (encodings.length + 31) / 32;\r
 198         int[] mask = new int[columns];\r
 199         for (int i = 0; i < columns; i++) {\r
 200             mask[i] = - 1; // set each bit to 1\r
 201                            // Note: All integers are signed in Java, assigning\r
 202                            // 2 ^ 32 -1 to mask is wrong!\r
 203         }\r
 204         int index = 0;\r
 205         while (index < unicodeText.length()) {\r
 206             int c = UTF16.charAt(unicodeText, index);\r
 207             int pvIndex = trie.getCodePointValue(c);\r
 208             index += UTF16.getCharCount(c);\r
 209             if (intersectMasks(mask, pvIndex, columns)) {\r
 210                 break;\r
 211             }\r
 212         }\r
 213         return selectForMask(mask);\r
 214     }\r
 215 }\r