/*
 ******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */
/*
 * This is a port of the C++ class UConverterSelector.
 *
 * Methods related to serialization are not ported in this version. In addition,
 * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
 * in Java.
 *
 * @author Shaopeng Jia
 */
18 package com.ibm.icu.charset;
20 import java.nio.charset.Charset;
21 import java.nio.charset.IllegalCharsetNameException;
22 import java.nio.charset.UnsupportedCharsetException;
23 import java.util.ArrayList;
24 import java.util.List;
26 import com.ibm.icu.impl.IntTrie;
27 import com.ibm.icu.impl.PropsVectors;
28 import com.ibm.icu.text.UTF16;
29 import com.ibm.icu.text.UnicodeSet;
/**
 * A charset selector is built with a list of charset names and, given an input
 * CharSequence, returns the list of names of the corresponding charsets which
 * can convert the CharSequence.
 */
40 public final class CharsetSelector {
42 private int[] pv; // table of bits
43 private String[] encodings; // encodings users ask to use
45 private void generateSelectorData(PropsVectors pvec,
46 UnicodeSet excludedCodePoints, int mappingTypes) {
47 int columns = (encodings.length + 31) / 32;
49 // set errorValue to all-ones
50 for (int col = 0; col < columns; ++col) {
51 pvec.setValue(PropsVectors.ERROR_VALUE_CP,
52 PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
55 for (int i = 0; i < encodings.length; ++i) {
56 Charset testCharset = CharsetICU.forNameICU(encodings[i]);
57 UnicodeSet unicodePointSet = new UnicodeSet(); // empty set
58 ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,
61 int mask = 1 << (i % 32);
62 // now iterate over intervals on set i
63 int itemCount = unicodePointSet.getRangeCount();
64 for (int j = 0; j < itemCount; ++j) {
65 int startChar = unicodePointSet.getRangeStart(j);
66 int endChar = unicodePointSet.getRangeEnd(j);
67 pvec.setValue(startChar, endChar, column, ~0, mask);
71 // handle excluded encodings
72 // Simply set their values to all 1's in the pvec
73 if (!excludedCodePoints.isEmpty()) {
74 int itemCount = excludedCodePoints.getRangeCount();
75 for (int j = 0; j < itemCount; ++j) {
76 int startChar = excludedCodePoints.getRangeStart(j);
77 int endChar = excludedCodePoints.getRangeEnd(j);
78 for (int col = 0; col < columns; col++) {
79 pvec.setValue(startChar, endChar, col, ~0, ~0);
84 trie = pvec.compactToTrieWithRowIndexes();
85 pv = pvec.getCompactedArray();
88 // internal function to intersect two sets of masks
89 // returns whether the mask has reduced to all zeros. The
90 // second set of mask consists of len elements in pv starting from
92 private boolean intersectMasks(int[] dest, int pvIndex, int len) {
94 for (int i = 0; i < len; ++i) {
95 oredDest |= (dest[i] &= pv[pvIndex + i]);
101 private List<String> selectForMask(int[] mask) {
102 // this is the context we will use. Store a table of indices to which
103 // encodings are legit
105 List<String> result = new ArrayList<String>();
106 int columns = (encodings.length + 31) / 32;
107 int numOnes = countOnes(mask, columns);
109 // now we know the exact space we need to index
112 for (int j = 0; j < columns; j++) {
114 for (int i = 0; i < 32 && k < encodings.length; i++, k++) {
116 result.add(encodings[k]);
123 // otherwise, index will remain NULL
127 // internal function to count how many 1's are there in a mask
128 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
129 private int countOnes(int[] mask, int len) {
131 for (int i = 0; i < len; ++i) {
133 for (; ent != 0; totalOnes++) {
134 ent &= ent - 1; // clear the least significant bit set
/**
 * Construct a CharsetSelector from a list of charset names.
 *
 * @param charsetList
 *            a list of charset names in the form of strings. If charsetList
 *            is empty, a selector for all available charset is constructed.
 * @param excludedCodePoints
 *            a set of code points to be excluded from consideration.
 *            Excluded code points appearing in the input CharSequence do
 *            not change the selection result. It could be empty when no
 *            code point should be excluded.
 * @param mappingTypes
 *            an int which determines whether to consider only roundtrip
 *            mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
 *            CharsetICU.java for the constants that are currently
 *            supported.
 * @throws IllegalArgumentException
 *             if the parameters is invalid.
 * @throws IllegalCharsetNameException
 *             If the given charset name is illegal.
 * @throws UnsupportedCharsetException
 *             If no support for the named charset is available in this
 *             instance of the Java virtual machine.
 */
public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
        int mappingTypes) {
    if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
            && mappingTypes != CharsetICU.ROUNDTRIP_SET) {
        throw new IllegalArgumentException("Unsupported mappingTypes");
    }

    int encodingCount = charsetList.size();
    if (encodingCount > 0) {
        encodings = charsetList.toArray(new String[0]);
    } else {
        // an empty list selects among every available charset
        encodings = CharsetProviderICU.getAvailableNames();
        encodingCount = encodings.length;
    }

    // one bit per encoding, packed 32 to a column
    PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32);
    generateSelectorData(pvec, excludedCodePoints, mappingTypes);
}
185 * Select charsets that can map all characters in a CharSequence, ignoring
186 * the excluded code points.
189 * a CharSequence. It could be empty.
190 * @return a list that contains charset names in the form of strings. The
191 * returned encoding names and their order will be the same as
192 * supplied when building the selector.
196 public List<String> selectForString(CharSequence unicodeText) {
197 int columns = (encodings.length + 31) / 32;
198 int[] mask = new int[columns];
199 for (int i = 0; i < columns; i++) {
200 mask[i] = - 1; // set each bit to 1
201 // Note: All integers are signed in Java, assigning
202 // 2 ^ 32 -1 to mask is wrong!
205 while (index < unicodeText.length()) {
206 int c = UTF16.charAt(unicodeText, index);
207 int pvIndex = trie.getCodePointValue(c);
208 index += UTF16.getCharCount(c);
209 if (intersectMasks(mask, pvIndex, columns)) {
213 return selectForMask(mask);