/*
 ******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

/*
 * This is a port of the C++ class UConverterSelector.
 *
 * Methods related to serialization are not ported in this version. In addition,
 * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
 * in Java.
 *
 * @author Shaopeng Jia
 */
\r
package com.ibm.icu.charset;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.impl.PropsVectors;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
\r
/**
 * A charset selector is built with a list of charset names and, given an input
 * CharSequence, returns the list of names of the corresponding charsets which
 * can convert the CharSequence.
 */
\r
40 public final class CharsetSelector {
\r
41 private IntTrie trie;
\r
42 private int[] pv; // table of bits
\r
43 private String[] encodings; // encodings users ask to use
\r
45 private void generateSelectorData(PropsVectors pvec,
\r
46 UnicodeSet excludedCodePoints, int mappingTypes) {
\r
47 int columns = (encodings.length + 31) / 32;
\r
49 // set errorValue to all-ones
\r
50 for (int col = 0; col < columns; ++col) {
\r
51 pvec.setValue(PropsVectors.ERROR_VALUE_CP,
\r
52 PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
\r
55 for (int i = 0; i < encodings.length; ++i) {
\r
56 Charset testCharset = CharsetICU.forNameICU(encodings[i]);
\r
57 UnicodeSet unicodePointSet = new UnicodeSet(); // empty set
\r
58 ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,
\r
60 int column = i / 32;
\r
61 int mask = 1 << (i % 32);
\r
62 // now iterate over intervals on set i
\r
63 int itemCount = unicodePointSet.getRangeCount();
\r
64 for (int j = 0; j < itemCount; ++j) {
\r
65 int startChar = unicodePointSet.getRangeStart(j);
\r
66 int endChar = unicodePointSet.getRangeEnd(j);
\r
67 pvec.setValue(startChar, endChar, column, ~0, mask);
\r
71 // handle excluded encodings
\r
72 // Simply set their values to all 1's in the pvec
\r
73 if (!excludedCodePoints.isEmpty()) {
\r
74 int itemCount = excludedCodePoints.getRangeCount();
\r
75 for (int j = 0; j < itemCount; ++j) {
\r
76 int startChar = excludedCodePoints.getRangeStart(j);
\r
77 int endChar = excludedCodePoints.getRangeEnd(j);
\r
78 for (int col = 0; col < columns; col++) {
\r
79 pvec.setValue(startChar, endChar, col, ~0, ~0);
\r
84 trie = pvec.compactToTrieWithRowIndexes();
\r
85 pv = pvec.getCompactedArray();
\r
88 // internal function to intersect two sets of masks
\r
89 // returns whether the mask has reduced to all zeros. The
\r
90 // second set of mask consists of len elements in pv starting from
\r
92 private boolean intersectMasks(int[] dest, int pvIndex, int len) {
\r
94 for (int i = 0; i < len; ++i) {
\r
95 oredDest |= (dest[i] &= pv[pvIndex + i]);
\r
97 return oredDest == 0;
\r
100 // internal function
\r
101 private List<String> selectForMask(int[] mask) {
\r
102 // this is the context we will use. Store a table of indices to which
\r
103 // encodings are legit
\r
105 Vector<String> result = new Vector<String>();
\r
106 int columns = (encodings.length + 31) / 32;
\r
107 int numOnes = countOnes(mask, columns);
\r
109 // now we know the exact space we need to index
\r
112 for (int j = 0; j < columns; j++) {
\r
114 for (int i = 0; i < 32 && k < encodings.length; i++, k++) {
\r
115 if ((v & 1) != 0) {
\r
116 result.addElement(encodings[k]);
\r
123 // otherwise, index will remain NULL
\r
127 // internal function to count how many 1's are there in a mask
\r
128 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
\r
129 private int countOnes(int[] mask, int len) {
\r
131 for (int i = 0; i < len; ++i) {
\r
133 for (; ent != 0; totalOnes++) {
\r
134 ent &= ent - 1; // clear the least significant bit set
\r
141 * Construct a CharsetSelector from a list of charset names.
\r
143 * @param charsetList
\r
144 * a list of charset names in the form of strings. If charsetList
\r
145 * is empty, a selector for all available charset is constructed.
\r
146 * @param excludedCodePoints
\r
147 * a set of code points to be excluded from consideration.
\r
148 * Excluded code points appearing in the input CharSequence do
\r
149 * not change the selection result. It could be empty when no
\r
150 * code point should be excluded.
\r
151 * @param mappingTypes
\r
152 * an int which determines whether to consider only roundtrip
\r
153 * mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
\r
154 * CharsetICU.java for the constants that are currently
\r
156 * @throws IllegalArgumentException
\r
157 * if the parameters is invalid.
\r
158 * @throws IllegalCharsetNameException
\r
159 * If the given charset name is illegal.
\r
160 * @throws UnsupportedCharsetException
\r
161 * If no support for the named charset is available in this
\r
162 * instance of the Java virtual machine.
\r
165 public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
\r
166 int mappingTypes) {
\r
167 if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
\r
168 && mappingTypes != CharsetICU.ROUNDTRIP_SET) {
\r
169 throw new IllegalArgumentException("Unsupported mappingTypes");
\r
172 int encodingCount = charsetList.size();
\r
173 if (encodingCount > 0) {
\r
174 encodings = charsetList.toArray(new String[0]);
\r
176 encodings = CharsetProviderICU.getAvailableNames();
\r
177 encodingCount = encodings.length;
\r
180 PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32);
\r
181 generateSelectorData(pvec, excludedCodePoints, mappingTypes);
\r
185 * Select charsets that can map all characters in a CharSequence, ignoring
\r
186 * the excluded code points.
\r
188 * @param unicodeText
\r
189 * a CharSequence. It could be empty.
\r
190 * @return a list that contains charset names in the form of strings. The
\r
191 * returned encoding names and their order will be the same as
\r
192 * supplied when building the selector.
\r
196 public List<String> selectForString(CharSequence unicodeText) {
\r
197 int columns = (encodings.length + 31) / 32;
\r
198 int[] mask = new int[columns];
\r
199 for (int i = 0; i < columns; i++) {
\r
200 mask[i] = - 1; // set each bit to 1
\r
201 // Note: All integers are signed in Java, assigning
\r
202 // 2 ^ 32 -1 to mask is wrong!
\r
205 while (index < unicodeText.length()) {
\r
206 int c = UTF16.charAt(unicodeText, index);
\r
207 int pvIndex = trie.getCodePointValue(c);
\r
208 index += UTF16.getCharCount(c);
\r
209 if (intersectMasks(mask, pvIndex, columns)) {
\r
213 return selectForMask(mask);
\r