2 *******************************************************************************
\r
3 * Copyright (C) 2006-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 *******************************************************************************
\r
10 package com.ibm.icu.charset;
\r
12 //import java.io.ByteArrayInputStream;
\r
13 //import java.io.InputStreamReader;
\r
14 import java.lang.reflect.Constructor;
\r
16 import java.lang.reflect.InvocationTargetException;
\r
17 import java.nio.charset.*;
\r
18 import java.util.HashMap;
\r
20 import com.ibm.icu.text.UnicodeSet;
\r
23 * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
\r
24 * This API is used to convert codepage or character encoded data to and
\r
25 * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that
\r
26 * converter, you can get its properties, set options, convert your data.</p>
\r
28 * <p>Since many software programs recognize different converter names for
\r
29 * different types of converters, there are other functions in this API to
\r
30 * iterate over the converter aliases.
\r
34 public abstract class CharsetICU extends Charset{
\r
36 String icuCanonicalName;
\r
37 String javaCanonicalName;
\r
40 float maxCharsPerByte;
\r
42 String name; /* +4: 60 internal name of the converter- invariant chars */
\r
44 int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
\r
46 byte platform; /* +68: 1 platform of the converter (only IBM now) */
\r
47 byte conversionType; /* +69: 1 conversion type */
\r
49 int minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
\r
50 int maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
\r
52 byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
\r
53 byte subCharLen; /* +76: 1 */
\r
55 byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
\r
56 byte hasFromUnicodeFallback; /* +78: 1 */
\r
57 short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
\r
58 byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
\r
59 //byte reserved[/*19*/]; /* +81: 19 to round out the structure */
\r
62 // typedef enum UConverterUnicodeSet {
\r
64 * Parameter that selects the set of roundtrippable Unicode code points.
\r
67 public static final int ROUNDTRIP_SET=0;
\r
69 * Select the set of Unicode code points with roundtrip or fallback mappings.
\r
70 * Not supported at this point.
\r
72 * @deprecated This API is ICU internal only.
\r
74 public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
\r
76 //} UConverterUnicodeSet;
\r
80 * @param icuCanonicalName
\r
81 * @param canonicalName
\r
85 protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
\r
86 super(canonicalName,aliases);
\r
87 if(canonicalName.length() == 0){
\r
88 throw new IllegalCharsetNameException(canonicalName);
\r
90 this.javaCanonicalName = canonicalName;
\r
91 this.icuCanonicalName = icuCanonicalName;
\r
95 * Ascertains if a charset is a subset of this charset
\r
96 * Implements the abstract method of super class.
\r
97 * @param cs charset to test
\r
98 * @return true if the given charset is a subset of this charset
\r
101 public boolean contains(Charset cs){
\r
104 } else if (this.equals(cs)) {
\r
109 private static final HashMap algorithmicCharsets = new HashMap();
\r
111 algorithmicCharsets.put("LMBCS-1", "com.ibm.icu.charset.CharsetLMBCS");
\r
112 algorithmicCharsets.put("BOCU-1", "com.ibm.icu.charset.CharsetBOCU1" );
\r
113 algorithmicCharsets.put("SCSU", "com.ibm.icu.charset.CharsetSCSU" );
\r
114 algorithmicCharsets.put("US-ASCII", "com.ibm.icu.charset.CharsetASCII" );
\r
115 algorithmicCharsets.put("ISO-8859-1", "com.ibm.icu.charset.Charset88591" );
\r
116 algorithmicCharsets.put("UTF-16", "com.ibm.icu.charset.CharsetUTF16" );
\r
117 algorithmicCharsets.put("UTF-16BE", "com.ibm.icu.charset.CharsetUTF16BE" );
\r
118 algorithmicCharsets.put("UTF-16LE", "com.ibm.icu.charset.CharsetUTF16LE" );
\r
119 algorithmicCharsets.put("UTF16_OppositeEndian", "com.ibm.icu.charset.CharsetUTF16LE" );
\r
120 algorithmicCharsets.put("UTF16_PlatformEndian", "com.ibm.icu.charset.CharsetUTF16" );
\r
121 algorithmicCharsets.put("UTF-32", "com.ibm.icu.charset.CharsetUTF32" );
\r
122 algorithmicCharsets.put("UTF-32BE", "com.ibm.icu.charset.CharsetUTF32BE" );
\r
123 algorithmicCharsets.put("UTF-32LE", "com.ibm.icu.charset.CharsetUTF32LE" );
\r
124 algorithmicCharsets.put("UTF32_OppositeEndian", "com.ibm.icu.charset.CharsetUTF32LE" );
\r
125 algorithmicCharsets.put("UTF32_PlatformEndian", "com.ibm.icu.charset.CharsetUTF32" );
\r
126 algorithmicCharsets.put("UTF-8", "com.ibm.icu.charset.CharsetUTF8" );
\r
127 algorithmicCharsets.put("CESU-8", "com.ibm.icu.charset.CharsetCESU8" );
\r
128 algorithmicCharsets.put("UTF-7", "com.ibm.icu.charset.CharsetUTF7" );
\r
129 algorithmicCharsets.put("ISCII,version=0", "com.ibm.icu.charset.CharsetISCII" );
\r
130 algorithmicCharsets.put("ISCII,version=1", "com.ibm.icu.charset.CharsetISCII" );
\r
131 algorithmicCharsets.put("ISCII,version=2", "com.ibm.icu.charset.CharsetISCII" );
\r
132 algorithmicCharsets.put("ISCII,version=3", "com.ibm.icu.charset.CharsetISCII" );
\r
133 algorithmicCharsets.put("ISCII,version=4", "com.ibm.icu.charset.CharsetISCII" );
\r
134 algorithmicCharsets.put("ISCII,version=5", "com.ibm.icu.charset.CharsetISCII" );
\r
135 algorithmicCharsets.put("ISCII,version=6", "com.ibm.icu.charset.CharsetISCII" );
\r
136 algorithmicCharsets.put("ISCII,version=7", "com.ibm.icu.charset.CharsetISCII" );
\r
137 algorithmicCharsets.put("ISCII,version=8", "com.ibm.icu.charset.CharsetISCII" );
\r
138 algorithmicCharsets.put("IMAP-mailbox-name", "com.ibm.icu.charset.CharsetUTF7" );
\r
139 algorithmicCharsets.put("HZ", "com.ibm.icu.charset.CharsetHZ" );
\r
140 algorithmicCharsets.put("ISO_2022,locale=ja,version=0", "com.ibm.icu.charset.CharsetISO2022" );
\r
141 algorithmicCharsets.put("ISO_2022,locale=ja,version=1", "com.ibm.icu.charset.CharsetISO2022" );
\r
142 algorithmicCharsets.put("ISO_2022,locale=ja,version=2", "com.ibm.icu.charset.CharsetISO2022" );
\r
143 algorithmicCharsets.put("ISO_2022,locale=ja,version=3", "com.ibm.icu.charset.CharsetISO2022" );
\r
144 algorithmicCharsets.put("ISO_2022,locale=ja,version=4", "com.ibm.icu.charset.CharsetISO2022" );
\r
145 algorithmicCharsets.put("ISO_2022,locale=zh,version=0", "com.ibm.icu.charset.CharsetISO2022" );
\r
146 algorithmicCharsets.put("ISO_2022,locale=zh,version=1", "com.ibm.icu.charset.CharsetISO2022" );
\r
147 algorithmicCharsets.put("ISO_2022,locale=ko,version=0", "com.ibm.icu.charset.CharsetISO2022" );
\r
148 algorithmicCharsets.put("ISO_2022,locale=ko,version=1", "com.ibm.icu.charset.CharsetISO2022" );
\r
151 /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
\r
152 String className = (String) algorithmicCharsets.get(icuCanonicalName);
\r
153 if(className==null){
\r
154 //all the cnv files are loaded as MBCS
\r
155 className = "com.ibm.icu.charset.CharsetMBCS";
\r
158 CharsetICU conv = null;
\r
159 Class cs = Class.forName(className);
\r
160 Class[] paramTypes = new Class[]{ String.class, String.class, String[].class};
\r
161 final Constructor c = cs.getConstructor(paramTypes);
\r
162 Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
\r
166 Object obj = c.newInstance(params);
\r
167 if(obj!=null && obj instanceof CharsetICU){
\r
168 conv = (CharsetICU)obj;
\r
171 }catch (InvocationTargetException e) {
\r
172 throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());
\r
174 }catch(ClassNotFoundException ex){
\r
175 }catch(NoSuchMethodException ex){
\r
176 }catch (IllegalAccessException ex){
\r
177 }catch (InstantiationException ex){
\r
179 throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
\r
182 static final boolean isSurrogate(int c){
\r
183 return (((c)&0xfffff800)==0xd800);
\r
187 * Returns the default charset name
\r
189 // static final String getDefaultCharsetName(){
\r
190 // String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
\r
191 // return defaultEncoding;
\r
195 * Returns a charset object for the named charset.
\r
196 * This method guarantees that an ICU charset is returned when
\r
197 * available. If the ICU charset provider does not support
\r
198 * the specified charset, then try other charset providers
\r
199 * including the standard Java charset provider.
\r
201 * @param charsetName The name of the requested charset,
\r
202 * may be either a canonical name or an alias
\r
203 * @return A charset object for the named charset
\r
204 * @throws IllegalCharsetNameException If the given charset name
\r
206 * @throws UnsupportedCharsetException If no support for the
\r
207 * named charset is available in this instance of the Java
\r
211 public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
\r
212 CharsetProviderICU icuProvider = new CharsetProviderICU();
\r
213 CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
\r
217 return Charset.forName(charsetName);
\r
221 // * @see java.lang.Comparable#compareTo(java.lang.Object)
\r
224 // public int compareTo(Object otherObj) {
\r
225 // if (!(otherObj instanceof CharsetICU)) {
\r
228 // return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
\r
232 * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
\r
233 * start of the stream for example U+FEFF (the Unicode BOM/signature
\r
234 * character) that can be ignored.
\r
236 * Detects Unicode signature byte sequences at the start of the byte stream
\r
237 * and returns number of bytes of the BOM of the indicated Unicode charset.
\r
238 * 0 is returned when no Unicode signature is recognized.
\r
241 // TODO This should be proposed as CharsetDecoderICU API.
\r
242 // static String detectUnicodeSignature(ByteBuffer source) {
\r
243 // int signatureLength = 0; // number of bytes of the signature
\r
244 // final int SIG_MAX_LEN = 5;
\r
245 // String sigUniCharset = null; // states what unicode charset is the BOM
\r
249 // * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
\r
250 // * don't misdetect something
\r
252 // byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
\r
255 // while (i < source.remaining() && i < SIG_MAX_LEN) {
\r
256 // start[i] = source.get(i);
\r
260 // if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
\r
261 // signatureLength = 2;
\r
262 // sigUniCharset = "UTF-16BE";
\r
263 // source.position(signatureLength);
\r
264 // return sigUniCharset;
\r
265 // } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
\r
266 // if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
\r
267 // signatureLength = 4;
\r
268 // sigUniCharset = "UTF-32LE";
\r
269 // source.position(signatureLength);
\r
270 // return sigUniCharset;
\r
272 // signatureLength = 2;
\r
273 // sigUniCharset = "UTF-16LE";
\r
274 // source.position(signatureLength);
\r
275 // return sigUniCharset;
\r
277 // } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
\r
278 // && start[2] == (byte) 0xBF) {
\r
279 // signatureLength = 3;
\r
280 // sigUniCharset = "UTF-8";
\r
281 // source.position(signatureLength);
\r
282 // return sigUniCharset;
\r
283 // } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
\r
284 // && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
\r
285 // signatureLength = 4;
\r
286 // sigUniCharset = "UTF-32BE";
\r
287 // source.position(signatureLength);
\r
288 // return sigUniCharset;
\r
289 // } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
\r
290 // && start[2] == (byte) 0xFF) {
\r
291 // signatureLength = 3;
\r
292 // sigUniCharset = "SCSU";
\r
293 // source.position(signatureLength);
\r
294 // return sigUniCharset;
\r
295 // } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
\r
296 // && start[2] == (byte) 0x28) {
\r
297 // signatureLength = 3;
\r
298 // sigUniCharset = "BOCU-1";
\r
299 // source.position(signatureLength);
\r
300 // return sigUniCharset;
\r
301 // } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
\r
302 // && start[2] == (byte) 0x76) {
\r
304 // if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
\r
305 // signatureLength = 5;
\r
306 // sigUniCharset = "UTF-7";
\r
307 // source.position(signatureLength);
\r
308 // return sigUniCharset;
\r
309 // } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
\r
310 // || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
\r
311 // signatureLength = 4;
\r
312 // sigUniCharset = "UTF-7";
\r
313 // source.position(signatureLength);
\r
314 // return sigUniCharset;
\r
316 // } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
\r
317 // && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
\r
318 // signatureLength = 4;
\r
319 // sigUniCharset = "UTF-EBCDIC";
\r
320 // source.position(signatureLength);
\r
321 // return sigUniCharset;
\r
324 // /* no known Unicode signature byte sequence recognized */
\r
329 abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
\r
332 * <p>Returns the set of Unicode code points that can be converted by an ICU Converter.
\r
334 * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
\r
335 * roundtrip-converted (converted without any data loss) with the converter. This set will not include code points that have fallback
\r
336 * mappings or are only the result of reverse fallback mappings. See UTR #22 "Character Mapping Markup Language" at <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
\r
337 * <p>* In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
\r
339 * <p>This is useful for example for
\r
340 * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
\r
341 * without/before actually performing the conversion</li>
\r
342 * <li>testing if a converter can be used for typical text for a certain locale,
\r
343 * by comparing its roundtrip set with the set of ExemplarCharacters from
\r
344 * ICU's locale data or other sources</li></ul>
\r
346 * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
\r
347 * the converter's specific set is filled in.
\r
348 * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
\r
349 * @throws IllegalArgumentException if the parameters do not match.
\r
352 public void getUnicodeSet(UnicodeSet setFillIn, int which){
\r
353 if( setFillIn == null || which != ROUNDTRIP_SET ){
\r
354 throw new IllegalArgumentException();
\r
357 getUnicodeSetImpl(setFillIn, which);
\r
360 static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
\r
361 setFillIn.add(0, 0xd7ff);
\r
362 setFillIn.add(0xe000, 0x10ffff);
\r
365 static void getCompleteUnicodeSet(UnicodeSet setFillIn){
\r
366 setFillIn.add(0, 0x10ffff);
\r