2 *******************************************************************************
3 * Copyright (C) 2006-2012, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 *******************************************************************************
10 package com.ibm.icu.charset;
12 import java.lang.reflect.Constructor;
13 import java.lang.reflect.InvocationTargetException;
14 import java.nio.charset.Charset;
15 import java.nio.charset.IllegalCharsetNameException;
16 import java.nio.charset.UnsupportedCharsetException;
17 import java.util.HashMap;
19 import com.ibm.icu.text.UnicodeSet;
22 * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
23 * This API is used to convert codepage or character encoded data to and
24 * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that
25 * converter, you can get its properties, set options, convert your data.</p>
27 * <p>Since many software programs recognize different converter names for
28 * different types of converters, there are other functions in this API to
29 * iterate over the converter aliases.
33 public abstract class CharsetICU extends Charset{
// Package-private state of a converter.
// icuCanonicalName / javaCanonicalName are assigned by the constructor from its
// arguments. The remaining fields carry trailing "+offset: size" notes, which
// look like the layout of a binary converter-data header.
// NOTE(review): presumably mirrors ICU4C's UConverterStaticData -- confirm.
// NOTE(review): the embedded line numbering skips values in this run (e.g.
// 36 -> 39), so some original lines are not present in this view.
35 String icuCanonicalName;
36 String javaCanonicalName;
39 float maxCharsPerByte;
41 String name; /* +4: 60 internal name of the converter- invariant chars */
43 int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
45 byte platform; /* +68: 1 platform of the converter (only IBM now) */
46 byte conversionType; /* +69: 1 conversion type */
48 int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
49 int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
51 byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
52 byte subCharLen; /* +76: 1 */
// The two fallback flags are stored as bytes rather than booleans (see the
// UBool remark on the first one).
54 byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
55 byte hasFromUnicodeFallback; /* +78: 1 */
56 short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
57 byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
58 //byte reserved[/*19*/]; /* +81: 19 to round out the structure */
61 // typedef enum UConverterUnicodeSet {
63 * Parameter that selects the set of roundtrippable Unicode code points.
// Selector for the `which` argument of getUnicodeSet(); it is the only value
// that method accepts (any other value raises IllegalArgumentException).
66 public static final int ROUNDTRIP_SET=0;
68 * Select the set of Unicode code points with roundtrip or fallback mappings.
69 * Not supported at this point.
71 * @deprecated This API is ICU internal only.
// Selector value 1. getUnicodeSet() rejects every value other than
// ROUNDTRIP_SET, so passing this constant currently raises
// IllegalArgumentException.
73 public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
75 //} UConverterUnicodeSet;
79 * @param icuCanonicalName
80 * @param canonicalName
// Constructor: forwards the Java canonical name and aliases to the
// java.nio.charset.Charset constructor, throws IllegalCharsetNameException
// when the canonical name is empty, then stores both names on the instance.
// The empty-name check runs after super(...), so it only takes effect if the
// superclass constructor accepted the empty string.
// NOTE(review): a null canonicalName fails before this check (in super(...) or
// at length()); icuCanonicalName is stored without any validation.
// NOTE(review): the embedded numbering skips 88 and 91, presumably the
// closing braces of the `if` and of the constructor.
84 protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
85 super(canonicalName,aliases);
86 if(canonicalName.length() == 0){
87 throw new IllegalCharsetNameException(canonicalName);
89 this.javaCanonicalName = canonicalName;
90 this.icuCanonicalName = icuCanonicalName;
94 * Ascertains if a charset is a subset of this charset.
95 * Implements the abstract method of super class.
96 * @param cs charset to test
97 * @return true if the given charset is a subset of this charset
// Subset test against another charset. Only the signature and the
// `this.equals(cs)` branch are visible here.
// NOTE(review): the embedded numbering jumps 100 -> 103 -> 108, so the leading
// condition of the if/else chain and every return statement are missing from
// this extract; restore them from the upstream file before relying on or
// editing this method.
100 public boolean contains(Charset cs){
103 } else if (this.equals(cs)) {
// Table mapping an ICU canonical converter name (including option suffixes
// such as ",version=1" or ",locale=ja,version=0") to the fully qualified name
// of the class that implements it. getCharset() looks names up here.
// Several names map to the same class: every LMBCS-n entry -> CharsetLMBCS,
// every ISCII version -> CharsetISCII, every ISO_2022 locale/version ->
// CharsetISO2022, and both UTF-7 and IMAP-mailbox-name -> CharsetUTF7.
// NOTE(review): the embedded numbering skips 109 and 163-164, presumably the
// `static {` opener and its closing brace around the put(...) calls.
108 private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
110 algorithmicCharsets.put("LMBCS-1", "com.ibm.icu.charset.CharsetLMBCS");
111 algorithmicCharsets.put("LMBCS-2", "com.ibm.icu.charset.CharsetLMBCS");
112 algorithmicCharsets.put("LMBCS-3", "com.ibm.icu.charset.CharsetLMBCS");
113 algorithmicCharsets.put("LMBCS-4", "com.ibm.icu.charset.CharsetLMBCS");
114 algorithmicCharsets.put("LMBCS-5", "com.ibm.icu.charset.CharsetLMBCS");
115 algorithmicCharsets.put("LMBCS-6", "com.ibm.icu.charset.CharsetLMBCS");
116 algorithmicCharsets.put("LMBCS-8", "com.ibm.icu.charset.CharsetLMBCS");
117 algorithmicCharsets.put("LMBCS-11", "com.ibm.icu.charset.CharsetLMBCS");
118 algorithmicCharsets.put("LMBCS-16", "com.ibm.icu.charset.CharsetLMBCS");
119 algorithmicCharsets.put("LMBCS-17", "com.ibm.icu.charset.CharsetLMBCS");
120 algorithmicCharsets.put("LMBCS-18", "com.ibm.icu.charset.CharsetLMBCS");
121 algorithmicCharsets.put("LMBCS-19", "com.ibm.icu.charset.CharsetLMBCS");
122 algorithmicCharsets.put("BOCU-1", "com.ibm.icu.charset.CharsetBOCU1" );
123 algorithmicCharsets.put("SCSU", "com.ibm.icu.charset.CharsetSCSU" );
124 algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII" );
125 algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591" );
// UTF-16 family, including the ICU-internal platform/opposite-endian names.
126 algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16" );
127 algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE" );
128 algorithmicCharsets.put("UTF-16BE,version=1", "com.ibm.icu.charset.CharsetUTF16BE" );
129 algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE" );
130 algorithmicCharsets.put("UTF-16LE,version=1", "com.ibm.icu.charset.CharsetUTF16LE" );
131 algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE" );
132 algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16" );
// UTF-32 family, same naming scheme as the UTF-16 entries.
133 algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32" );
134 algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE" );
135 algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE" );
136 algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
137 algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32" );
138 algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8" );
139 algorithmicCharsets.put("CESU-8", "com.ibm.icu.charset.CharsetCESU8" );
140 algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7" );
141 algorithmicCharsets.put("ISCII,version=0", "com.ibm.icu.charset.CharsetISCII" );
142 algorithmicCharsets.put("ISCII,version=1", "com.ibm.icu.charset.CharsetISCII" );
143 algorithmicCharsets.put("ISCII,version=2", "com.ibm.icu.charset.CharsetISCII" );
144 algorithmicCharsets.put("ISCII,version=3", "com.ibm.icu.charset.CharsetISCII" );
145 algorithmicCharsets.put("ISCII,version=4", "com.ibm.icu.charset.CharsetISCII" );
146 algorithmicCharsets.put("ISCII,version=5", "com.ibm.icu.charset.CharsetISCII" );
147 algorithmicCharsets.put("ISCII,version=6", "com.ibm.icu.charset.CharsetISCII" );
148 algorithmicCharsets.put("ISCII,version=7", "com.ibm.icu.charset.CharsetISCII" );
149 algorithmicCharsets.put("ISCII,version=8", "com.ibm.icu.charset.CharsetISCII" );
150 algorithmicCharsets.put("IMAP-mailbox-name", "com.ibm.icu.charset.CharsetUTF7" );
151 algorithmicCharsets.put("HZ", "com.ibm.icu.charset.CharsetHZ" );
152 algorithmicCharsets.put("ISO_2022,locale=ja,version=0", "com.ibm.icu.charset.CharsetISO2022" );
153 algorithmicCharsets.put("ISO_2022,locale=ja,version=1", "com.ibm.icu.charset.CharsetISO2022" );
154 algorithmicCharsets.put("ISO_2022,locale=ja,version=2", "com.ibm.icu.charset.CharsetISO2022" );
155 algorithmicCharsets.put("ISO_2022,locale=ja,version=3", "com.ibm.icu.charset.CharsetISO2022" );
156 algorithmicCharsets.put("ISO_2022,locale=ja,version=4", "com.ibm.icu.charset.CharsetISO2022" );
157 algorithmicCharsets.put("ISO_2022,locale=zh,version=0", "com.ibm.icu.charset.CharsetISO2022" );
158 algorithmicCharsets.put("ISO_2022,locale=zh,version=1", "com.ibm.icu.charset.CharsetISO2022" );
159 algorithmicCharsets.put("ISO_2022,locale=zh,version=2", "com.ibm.icu.charset.CharsetISO2022" );
160 algorithmicCharsets.put("ISO_2022,locale=ko,version=0", "com.ibm.icu.charset.CharsetISO2022" );
161 algorithmicCharsets.put("ISO_2022,locale=ko,version=1", "com.ibm.icu.charset.CharsetISO2022" );
162 algorithmicCharsets.put("x11-compound-text", "com.ibm.icu.charset.CharsetCompoundText" );
// Creates the charset object for an ICU canonical name via reflection:
//   1. look the implementing class name up in algorithmicCharsets;
//   2. use "com.ibm.icu.charset.CharsetMBCS" instead -- presumably when the
//      lookup finds nothing; the guarding condition (numbered 167) is not
//      visible in this extract;
//   3. load the class, locate its (String, String, String[]) constructor and
//      invoke it with icuCanonicalName, javaCanonicalName and aliases.
// Failures are reported as UnsupportedCharsetException.
// NOTE(review): the embedded numbering skips 167, 170-171, 177-179, 181-183,
// 186, 191 and 193-194, so the `if`, the `try {`, the success return and
// several braces are missing here.
// NOTE(review): the four catch clauses after InvocationTargetException show no
// statements in this view, and the final throw does not attach the caught
// exception; consider initCause(...) so the root failure is not lost.
// NOTE(review): UnsupportedCharsetException's constructor argument is the
// charset name, so getCharsetName() will return the whole diagnostic text.
165 /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
166 String className = algorithmicCharsets.get(icuCanonicalName);
168 //all the cnv files are loaded as MBCS
169 className = "com.ibm.icu.charset.CharsetMBCS";
172 CharsetICU conv = null;
// asSubclass() fails with ClassCastException if the named class is not a
// CharsetICU; that exception type is not among the visible catch clauses.
173 Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
174 Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class, String[].class};
175 final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
176 Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
180 conv = c.newInstance(params);
// The constructor itself threw: report the wrapped target exception's text.
184 }catch (InvocationTargetException e) {
185 throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());
187 }catch(ClassNotFoundException ex){
188 }catch(NoSuchMethodException ex){
189 }catch (IllegalAccessException ex){
190 }catch (InstantiationException ex){
192 throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
// Returns true when c lies in the surrogate range U+D800..U+DFFF: clearing the
// low 11 bits leaves exactly 0xd800 only for values in that range.
195 static final boolean isSurrogate(int c){
196 return (((c)&0xfffff800)==0xd800);
200 * Returns the default charset name
202 // static final String getDefaultCharsetName(){
203 // String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
204 // return defaultEncoding;
208 * Returns a charset object for the named charset.
209 * This method guarantees that an ICU charset is returned when
210 * available. If the ICU charset provider does not support
211 * the specified charset, then try other charset providers
212 * including the standard Java charset provider.
214 * @param charsetName The name of the requested charset,
215 * may be either a canonical name or an alias
216 * @return A charset object for the named charset
217 * @throws IllegalCharsetNameException If the given charset name
219 * @throws UnsupportedCharsetException If no support for the
220 * named charset is available in this instance of the Java
// Resolves a charset name, asking a CharsetProviderICU first and calling
// Charset.forName(charsetName) otherwise.
// NOTE(review): the embedded numbering skips 227-229; presumably those lines
// return `cs` when the ICU provider found a match -- confirm against upstream.
// NOTE(review): a new CharsetProviderICU is constructed on every call.
// NOTE(review): the cast assumes charsetForName() returns only CharsetICU
// instances (or null); anything else would raise ClassCastException.
224 public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
225 CharsetProviderICU icuProvider = new CharsetProviderICU();
226 CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
230 return Charset.forName(charsetName);
234 // * @see java.lang.Comparable#compareTo(java.lang.Object)
237 // public int compareTo(Object otherObj) {
238 // if (!(otherObj instanceof CharsetICU)) {
241 // return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
245 * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
246 * start of the stream for example U+FEFF (the Unicode BOM/signature
247 * character) that can be ignored.
249 * Detects Unicode signature byte sequences at the start of the byte stream
250 * and returns number of bytes of the BOM of the indicated Unicode charset.
251 * 0 is returned when no Unicode signature is recognized.
254 // TODO This should be proposed as CharsetDecoderICU API.
255 // static String detectUnicodeSignature(ByteBuffer source) {
256 // int signatureLength = 0; // number of bytes of the signature
257 // final int SIG_MAX_LEN = 5;
258 // String sigUniCharset = null; // states what unicode charset is the BOM
262 // * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
263 // * don't misdetect something
265 // byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
268 // while (i < source.remaining() && i < SIG_MAX_LEN) {
269 // start[i] = source.get(i);
273 // if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
274 // signatureLength = 2;
275 // sigUniCharset = "UTF-16BE";
276 // source.position(signatureLength);
277 // return sigUniCharset;
278 // } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
279 // if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
280 // signatureLength = 4;
281 // sigUniCharset = "UTF-32LE";
282 // source.position(signatureLength);
283 // return sigUniCharset;
285 // signatureLength = 2;
286 // sigUniCharset = "UTF-16LE";
287 // source.position(signatureLength);
288 // return sigUniCharset;
290 // } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
291 // && start[2] == (byte) 0xBF) {
292 // signatureLength = 3;
293 // sigUniCharset = "UTF-8";
294 // source.position(signatureLength);
295 // return sigUniCharset;
296 // } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
297 // && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
298 // signatureLength = 4;
299 // sigUniCharset = "UTF-32BE";
300 // source.position(signatureLength);
301 // return sigUniCharset;
302 // } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
303 // && start[2] == (byte) 0xFF) {
304 // signatureLength = 3;
305 // sigUniCharset = "SCSU";
306 // source.position(signatureLength);
307 // return sigUniCharset;
308 // } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
309 // && start[2] == (byte) 0x28) {
310 // signatureLength = 3;
311 // sigUniCharset = "BOCU-1";
312 // source.position(signatureLength);
313 // return sigUniCharset;
314 // } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
315 // && start[2] == (byte) 0x76) {
317 // if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
318 // signatureLength = 5;
319 // sigUniCharset = "UTF-7";
320 // source.position(signatureLength);
321 // return sigUniCharset;
322 // } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
323 // || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
324 // signatureLength = 4;
325 // sigUniCharset = "UTF-7";
326 // source.position(signatureLength);
327 // return sigUniCharset;
329 // } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
330 // && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
331 // signatureLength = 4;
332 // sigUniCharset = "UTF-EBCDIC";
333 // source.position(signatureLength);
334 // return sigUniCharset;
337 // /* no known Unicode signature byte sequence recognized */
// Subclass hook called by getUnicodeSet() once its arguments have passed
// validation; setFillIn is the set to populate and `which` is the selector,
// forwarded unchanged.
342 abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
345 * <p>Returns the set of Unicode code points that can be converted by an ICU Converter.
347 * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
348 * roundtrip-converted (converted without any data loss) with the converter. This set will not include code points that have fallback
349 * mappings or are only the result of reverse fallback mappings. See UTR #22 "Character Mapping Markup Language" at <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
350 * <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
352 * <p>This is useful for example for
353 * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
354 * without/before actually performing the conversion</li>
355 * <li>testing if a converter can be used for text for typical text for a certain locale,
356 * by comparing its roundtrip set with the set of ExemplarCharacters from
357 * ICU's locale data or other sources</li></ul>
359 * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
360 * the converter's specific set is filled in.
361 * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
362 * @throws IllegalArgumentException if the parameters do not match.
// Public entry point: throws IllegalArgumentException (with no message) when
// setFillIn is null or `which` is anything other than ROUNDTRIP_SET, then
// delegates to getUnicodeSetImpl().
// NOTE(review): the embedded numbering skips 368-369 -- the closing brace of
// the `if` and presumably the step that clears setFillIn before it is filled;
// confirm against upstream.
365 public void getUnicodeSet(UnicodeSet setFillIn, int which){
366 if( setFillIn == null || which != ROUNDTRIP_SET ){
367 throw new IllegalArgumentException();
370 getUnicodeSetImpl(setFillIn, which);
374 * Returns whether or not the charset of the converter has a fixed number of bytes
375 * per charset character.
376 * Examples of this are converters that are of the type UCNV_SBCS or UCNV_DBCS.
377 * Another example is UTF-32, which is always 4 bytes per character. A UTF-32 code point
378 * may represent more than one UTF-8 or UTF-16 code unit but always has a size of 4 bytes.
379 * Note: This method is not intended to be used to determine whether the charset has a
380 * fixed ratio of bytes to Unicode code units for any particular Unicode encoding form.
381 * @return true if the converter is fixed-width
// Fixed-width test with two visible checks: first whether this object is a
// CharsetASCII or CharsetUTF32 instance, then -- for CharsetMBCS -- whether the
// shared static data reports equal maxBytesPerChar and minBytesPerChar.
// NOTE(review): the embedded numbering skips 386-388 and 391-396, so every
// return statement and closing brace is missing from this extract.
// NOTE(review): whether CharsetUTF32BE/CharsetUTF32LE satisfy
// `instanceof CharsetUTF32` depends on their class hierarchy, which is not
// visible here.
384 public boolean isFixedWidth() {
385 if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
389 if (this instanceof CharsetMBCS) {
390 if (((CharsetMBCS)this).sharedData.staticData.maxBytesPerChar == ((CharsetMBCS)this).sharedData.staticData.minBytesPerChar) {
// Adds every code point outside the surrogate block to setFillIn, as two
// ranges: 0..0xD7FF and 0xE000..0x10FFFF. Only add() is called, so existing
// contents of the set are kept.
398 static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
399 setFillIn.add(0, 0xd7ff);
400 setFillIn.add(0xe000, 0x10ffff);
// Adds the full code point range 0..0x10FFFF (surrogates included) to
// setFillIn; existing contents of the set are kept.
403 static void getCompleteUnicodeSet(UnicodeSet setFillIn){
404 setFillIn.add(0, 0x10ffff);