jars/icu4j-52_1/main/classes/charset/src/com/ibm/icu/charset/CharsetICU.java

   1 /**
   2 *******************************************************************************
   3 * Copyright (C) 2006-2012, International Business Machines Corporation and    *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 *
   7 *******************************************************************************
   8 */
   9
  10 package com.ibm.icu.charset;
  11
  12 import java.lang.reflect.Constructor;
  13 import java.lang.reflect.InvocationTargetException;
  14 import java.nio.charset.Charset;
  15 import java.nio.charset.IllegalCharsetNameException;
  16 import java.nio.charset.UnsupportedCharsetException;
  17 import java.util.HashMap;
  18
  19 import com.ibm.icu.text.UnicodeSet;
  20
  21 /**
  22  * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
  23  * This API is used to convert codepage or character encoded data to and
  24  * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that
  25  * converter, you can get its properties, set options, convert your data.</p>
  26  *
  27  * <p>Since many software programs recognize different converter names for
  28  * different types of converters, there are other functions in this API to
  29  * iterate over the converter aliases.
  30  *
  31  * @stable ICU 3.6
  32  */
  33 public abstract class CharsetICU extends Charset{
  34
  35      String icuCanonicalName;
  36      String javaCanonicalName;
  37      int options;
  38
  39      float  maxCharsPerByte;
  40
  41      String name; /* +4: 60  internal name of the converter- invariant chars */
  42
  43      int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */
  44
  45      byte platform;                /* +68: 1 platform of the converter (only IBM now) */
  46      byte conversionType;          /* +69: 1 conversion type */
  47
  48      int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
  49      int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */
  50
  51      byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */
  52      byte subCharLen;              /* +76: 1 */
  53
  54      byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
  55      byte hasFromUnicodeFallback; /* +78: 1 */
  56      short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
  57      byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
  58      //byte reserved[/*19*/];           /* +81: 19 to round out the structure */
  59
  60
  61     // typedef enum UConverterUnicodeSet {
  62      /**
  63       * Parameter that select the set of roundtrippable Unicode code points.
  64       * @stable ICU 4.0
  65       */
  66       public static final int ROUNDTRIP_SET=0;
  67       /**
  68        * Select the set of Unicode code points with roundtrip or fallback mappings.
  69        * Not supported at this point.
  70        * @internal
  71        * @deprecated This API is ICU internal only.
  72        */
  73       public static final int ROUNDTRIP_AND_FALLBACK_SET =1;
  74
  75     //} UConverterUnicodeSet;
  76
  77     /**
  78      *
  79      * @param icuCanonicalName
  80      * @param canonicalName
  81      * @param aliases
  82      * @stable ICU 3.6
  83      */
  84     protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
  85         super(canonicalName,aliases);
  86         if(canonicalName.length() == 0){
  87             throw new IllegalCharsetNameException(canonicalName);
  88         }
  89         this.javaCanonicalName = canonicalName;
  90         this.icuCanonicalName  = icuCanonicalName;
  91     }
  92
  93     /**
  94      * Ascertains if a charset is a sub set of this charset
  95      * Implements the abstract method of super class.
  96      * @param cs charset to test
  97      * @return true if the given charset is a subset of this charset
  98      * @stable ICU 3.6
  99      */
 100     public boolean contains(Charset cs){
 101         if (null == cs) {
 102             return false;
 103         } else if (this.equals(cs)) {
 104             return true;
 105         }
 106         return false;
 107     }
 108     private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
 109     static{
 110         algorithmicCharsets.put("LMBCS-1",               "com.ibm.icu.charset.CharsetLMBCS");
 111         algorithmicCharsets.put("LMBCS-2",               "com.ibm.icu.charset.CharsetLMBCS");
 112         algorithmicCharsets.put("LMBCS-3",               "com.ibm.icu.charset.CharsetLMBCS");
 113         algorithmicCharsets.put("LMBCS-4",               "com.ibm.icu.charset.CharsetLMBCS");
 114         algorithmicCharsets.put("LMBCS-5",               "com.ibm.icu.charset.CharsetLMBCS");
 115         algorithmicCharsets.put("LMBCS-6",               "com.ibm.icu.charset.CharsetLMBCS");
 116         algorithmicCharsets.put("LMBCS-8",               "com.ibm.icu.charset.CharsetLMBCS");
 117         algorithmicCharsets.put("LMBCS-11",              "com.ibm.icu.charset.CharsetLMBCS");
 118         algorithmicCharsets.put("LMBCS-16",              "com.ibm.icu.charset.CharsetLMBCS");
 119         algorithmicCharsets.put("LMBCS-17",              "com.ibm.icu.charset.CharsetLMBCS");
 120         algorithmicCharsets.put("LMBCS-18",              "com.ibm.icu.charset.CharsetLMBCS");
 121         algorithmicCharsets.put("LMBCS-19",              "com.ibm.icu.charset.CharsetLMBCS");
 122         algorithmicCharsets.put("BOCU-1",                "com.ibm.icu.charset.CharsetBOCU1" );
 123         algorithmicCharsets.put("SCSU",                  "com.ibm.icu.charset.CharsetSCSU" );
 124         algorithmicCharsets.put("US-ASCII",              "com.ibm.icu.charset.CharsetASCII" );
 125         algorithmicCharsets.put("ISO-8859-1",            "com.ibm.icu.charset.Charset88591" );
 126         algorithmicCharsets.put("UTF-16",                "com.ibm.icu.charset.CharsetUTF16" );
 127         algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.charset.CharsetUTF16BE" );
 128         algorithmicCharsets.put("UTF-16BE,version=1",    "com.ibm.icu.charset.CharsetUTF16BE" );
 129         algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.charset.CharsetUTF16LE" );
 130         algorithmicCharsets.put("UTF-16LE,version=1",    "com.ibm.icu.charset.CharsetUTF16LE" );
 131         algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF16LE" );
 132         algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF16" );
 133         algorithmicCharsets.put("UTF-32",                "com.ibm.icu.charset.CharsetUTF32" );
 134         algorithmicCharsets.put("UTF-32BE",              "com.ibm.icu.charset.CharsetUTF32BE" );
 135         algorithmicCharsets.put("UTF-32LE",              "com.ibm.icu.charset.CharsetUTF32LE" );
 136         algorithmicCharsets.put("UTF32_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF32LE" );
 137         algorithmicCharsets.put("UTF32_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF32" );
 138         algorithmicCharsets.put("UTF-8",                 "com.ibm.icu.charset.CharsetUTF8" );
 139         algorithmicCharsets.put("CESU-8",                "com.ibm.icu.charset.CharsetCESU8" );
 140         algorithmicCharsets.put("UTF-7",                 "com.ibm.icu.charset.CharsetUTF7" );
 141         algorithmicCharsets.put("ISCII,version=0",       "com.ibm.icu.charset.CharsetISCII" );
 142         algorithmicCharsets.put("ISCII,version=1",       "com.ibm.icu.charset.CharsetISCII" );
 143         algorithmicCharsets.put("ISCII,version=2",       "com.ibm.icu.charset.CharsetISCII" );
 144         algorithmicCharsets.put("ISCII,version=3",       "com.ibm.icu.charset.CharsetISCII" );
 145         algorithmicCharsets.put("ISCII,version=4",       "com.ibm.icu.charset.CharsetISCII" );
 146         algorithmicCharsets.put("ISCII,version=5",       "com.ibm.icu.charset.CharsetISCII" );
 147         algorithmicCharsets.put("ISCII,version=6",       "com.ibm.icu.charset.CharsetISCII" );
 148         algorithmicCharsets.put("ISCII,version=7",       "com.ibm.icu.charset.CharsetISCII" );
 149         algorithmicCharsets.put("ISCII,version=8",       "com.ibm.icu.charset.CharsetISCII" );
 150         algorithmicCharsets.put("IMAP-mailbox-name",     "com.ibm.icu.charset.CharsetUTF7" );
 151         algorithmicCharsets.put("HZ",                    "com.ibm.icu.charset.CharsetHZ" );
 152         algorithmicCharsets.put("ISO_2022,locale=ja,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
 153         algorithmicCharsets.put("ISO_2022,locale=ja,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
 154         algorithmicCharsets.put("ISO_2022,locale=ja,version=2",               "com.ibm.icu.charset.CharsetISO2022" );
 155         algorithmicCharsets.put("ISO_2022,locale=ja,version=3",               "com.ibm.icu.charset.CharsetISO2022" );
 156         algorithmicCharsets.put("ISO_2022,locale=ja,version=4",               "com.ibm.icu.charset.CharsetISO2022" );
 157         algorithmicCharsets.put("ISO_2022,locale=zh,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
 158         algorithmicCharsets.put("ISO_2022,locale=zh,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
 159         algorithmicCharsets.put("ISO_2022,locale=zh,version=2",               "com.ibm.icu.charset.CharsetISO2022" );
 160         algorithmicCharsets.put("ISO_2022,locale=ko,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
 161         algorithmicCharsets.put("ISO_2022,locale=ko,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
 162         algorithmicCharsets.put("x11-compound-text",                          "com.ibm.icu.charset.CharsetCompoundText" );
 163         }
 164
 165     /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
 166        String className = algorithmicCharsets.get(icuCanonicalName);
 167        if(className==null){
 168            //all the cnv files are loaded as MBCS
 169            className = "com.ibm.icu.charset.CharsetMBCS";
 170        }
 171        try{
 172            CharsetICU conv = null;
 173            Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
 174            Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class,  String[].class};
 175            final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
 176            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
 177
 178            // Run constructor
 179            try {
 180                conv = c.newInstance(params);
 181                if (conv != null) {
 182                    return conv;
 183                }
 184            }catch (InvocationTargetException e) {
 185                throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());
 186            }
 187        }catch(ClassNotFoundException ex){
 188        }catch(NoSuchMethodException ex){
 189        }catch (IllegalAccessException ex){
 190        }catch (InstantiationException ex){
 191        }
 192        throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
 193     }
 194
 195     static final boolean isSurrogate(int c){
 196         return (((c)&0xfffff800)==0xd800);
 197     }
 198
 199     /*
 200      * Returns the default charset name
 201      */
 202 //    static final String getDefaultCharsetName(){
 203 //        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
 204 //        return defaultEncoding;
 205 //    }
 206
 207     /**
 208      * Returns a charset object for the named charset.
 209      * This method gurantee that ICU charset is returned when
 210      * available.  If the ICU charset provider does not support
 211      * the specified charset, then try other charset providers
 212      * including the standard Java charset provider.
 213      *
 214      * @param charsetName The name of the requested charset,
 215      * may be either a canonical name or an alias
 216      * @return A charset object for the named charset
 217      * @throws IllegalCharsetNameException If the given charset name
 218      * is illegal
 219      * @throws UnsupportedCharsetException If no support for the
 220      * named charset is available in this instance of th Java
 221      * virtual machine
 222      * @stable ICU 3.6
 223      */
 224     public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
 225         CharsetProviderICU icuProvider = new CharsetProviderICU();
 226         CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
 227         if (cs != null) {
 228             return cs;
 229         }
 230         return Charset.forName(charsetName);
 231     }
 232
 233 //    /**
 234 //     * @see java.lang.Comparable#compareTo(java.lang.Object)
 235 //     * @stable 3.8
 236 //     */
 237 //    public int compareTo(Object otherObj) {
 238 //        if (!(otherObj instanceof CharsetICU)) {
 239 //            return -1;
 240 //        }
 241 //        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
 242 //    }
 243
 244     /**
 245      * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
 246      * start of the stream for example U+FEFF (the Unicode BOM/signature
 247      * character) that can be ignored.
 248      *
 249      * Detects Unicode signature byte sequences at the start of the byte stream
 250      * and returns number of bytes of the BOM of the indicated Unicode charset.
 251      * 0 is returned when no Unicode signature is recognized.
 252      *
 253      */
 254     // TODO This should be proposed as CharsetDecoderICU API.
 255 //    static String detectUnicodeSignature(ByteBuffer source) {
 256 //        int signatureLength = 0; // number of bytes of the signature
 257 //        final int SIG_MAX_LEN = 5;
 258 //        String sigUniCharset = null; // states what unicode charset is the BOM
 259 //        int i = 0;
 260 //
 261 //        /*
 262 //         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
 263 //         * don't misdetect something
 264 //         */
 265 //        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
 266 //                (byte) 0xa5 };
 267 //
 268 //        while (i < source.remaining() && i < SIG_MAX_LEN) {
 269 //            start[i] = source.get(i);
 270 //            i++;
 271 //        }
 272 //
 273 //        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
 274 //            signatureLength = 2;
 275 //            sigUniCharset = "UTF-16BE";
 276 //            source.position(signatureLength);
 277 //            return sigUniCharset;
 278 //        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
 279 //            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
 280 //                signatureLength = 4;
 281 //                sigUniCharset = "UTF-32LE";
 282 //                source.position(signatureLength);
 283 //                return sigUniCharset;
 284 //            } else {
 285 //                signatureLength = 2;
 286 //                sigUniCharset = "UTF-16LE";
 287 //                source.position(signatureLength);
 288 //                return sigUniCharset;
 289 //            }
 290 //        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
 291 //                && start[2] == (byte) 0xBF) {
 292 //            signatureLength = 3;
 293 //            sigUniCharset = "UTF-8";
 294 //            source.position(signatureLength);
 295 //            return sigUniCharset;
 296 //        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
 297 //                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
 298 //            signatureLength = 4;
 299 //            sigUniCharset = "UTF-32BE";
 300 //            source.position(signatureLength);
 301 //            return sigUniCharset;
 302 //        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
 303 //                && start[2] == (byte) 0xFF) {
 304 //            signatureLength = 3;
 305 //            sigUniCharset = "SCSU";
 306 //            source.position(signatureLength);
 307 //            return sigUniCharset;
 308 //        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
 309 //                && start[2] == (byte) 0x28) {
 310 //            signatureLength = 3;
 311 //            sigUniCharset = "BOCU-1";
 312 //            source.position(signatureLength);
 313 //            return sigUniCharset;
 314 //        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
 315 //                && start[2] == (byte) 0x76) {
 316 //
 317 //            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
 318 //                signatureLength = 5;
 319 //                sigUniCharset = "UTF-7";
 320 //                source.position(signatureLength);
 321 //                return sigUniCharset;
 322 //            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
 323 //                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
 324 //                signatureLength = 4;
 325 //                sigUniCharset = "UTF-7";
 326 //                source.position(signatureLength);
 327 //                return sigUniCharset;
 328 //            }
 329 //        } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
 330 //                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
 331 //            signatureLength = 4;
 332 //            sigUniCharset = "UTF-EBCDIC";
 333 //            source.position(signatureLength);
 334 //            return sigUniCharset;
 335 //        }
 336 //
 337 //        /* no known Unicode signature byte sequence recognized */
 338 //        return null;
 339 //    }
 340
 341
    /**
     * Subclass hook behind {@link #getUnicodeSet(UnicodeSet, int)}. That method calls
     * this one only after it has rejected a null set and any selector other than
     * ROUNDTRIP_SET, and after it has cleared <code>setFillIn</code>.
     * NOTE(review): implementations presumably add the code points their converter
     * supports for the given selector -- confirm against the concrete subclasses.
     *
     * @param setFillIn the already-cleared set handed over by getUnicodeSet
     * @param which the selector handed over by getUnicodeSet
     */
    abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);
 343
 344     /**
 345     * <p>Returns the set of Unicode code points that can be converted by an ICU Converter.
 346     * <p>
 347     * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
 348     * roundtrip-converted (converted without any data loss) with the converter This set will not include code points that have fallback
 349     * mappings or are only the result of reverse fallback mappings.  See UTR #22 "Character Mapping Markup Language" at  <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
 350     * <p>* In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
 351     * <p>
 352     * <p>This is useful for example for
 353     * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
 354     *   without/before actually performing the conversion</li>
 355     * <li>testing if a converter can be used for text for typical text for a certain locale,
 356     *   by comparing its roundtrip set with the set of ExemplarCharacters from
 357     *   ICU's locale data or other sources</li></ul>
 358     *
 359     * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
 360     *                   the converter's specific set is filled in.
 361     * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
 362     * @throws IllegalArgumentException if the parameters does not match.
 363     * @stable ICU 4.0
 364     */
 365        public void getUnicodeSet(UnicodeSet setFillIn, int which){
 366            if( setFillIn == null || which != ROUNDTRIP_SET ){
 367                throw new IllegalArgumentException();
 368            }
 369            setFillIn.clear();
 370            getUnicodeSetImpl(setFillIn, which);
 371        }
 372
 373        /**
 374         * Returns whether or not the charset of the converter has a fixed number of bytes
 375         * per charset character.
 376         * An example of this are converters that are of the type UCNV_SBCS or UCNV_DBCS.
 377         * Another example is UTF-32 which is always 4 bytes per character.  A UTF-32 code point
 378         * may represent more than one UTF-8 or UTF-16 code units but always have size of 4 bytes.
 379         * Note: This method is not intended to be used to determine whether the charset has a
 380         * fixed ratio of bytes to Unicode codes units for any particular Unicode encoding form.
 381         * @return true if the converter is fixed-width
 382         * @stable ICU 4.8
 383         */
 384        public boolean isFixedWidth() {
 385            if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
 386                return true;
 387            }
 388
 389            if (this instanceof CharsetMBCS) {
 390                if (((CharsetMBCS)this).sharedData.staticData.maxBytesPerChar == ((CharsetMBCS)this).sharedData.staticData.minBytesPerChar) {
 391                    return true;
 392                }
 393            }
 394
 395            return false;
 396        }
 397
 398        static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
 399            setFillIn.add(0, 0xd7ff);
 400            setFillIn.add(0xe000, 0x10ffff);
 401        }
 402
 403        static void getCompleteUnicodeSet(UnicodeSet setFillIn){
 404            setFillIn.add(0, 0x10ffff);
 405        }
 406 }