jars/icu4j-4_2_1-src/src/com/ibm/icu/charset/CharsetICU.java

   1 /**\r
   2 *******************************************************************************\r
   3 * Copyright (C) 2006-2009, International Business Machines Corporation and    *\r
   4 * others. All Rights Reserved.                                                *\r
   5 *******************************************************************************\r
   6 *\r
   7 *******************************************************************************\r
   8 */ \r
   9 \r
  10 package com.ibm.icu.charset;\r
  11 \r
  12 //import java.io.ByteArrayInputStream;\r
  13 //import java.io.InputStreamReader;\r
  14 import java.lang.reflect.Constructor;\r
  15 \r
  16 import java.lang.reflect.InvocationTargetException;\r
  17 import java.nio.charset.*;\r
  18 import java.util.HashMap;\r
  19 \r
  20 import com.ibm.icu.text.UnicodeSet;\r
  21 \r
  22 /**\r
  23  * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.\r
  24  * This API is used to convert codepage or character encoded data to and\r
  25  * from UTF-16. You can open a converter with {@link Charset#forName } and {@link #forNameICU }. With that\r
  26  * converter, you can get its properties, set options, convert your data.</p>\r
  27  *\r
  28  * <p>Since many software programs recogize different converter names for\r
  29  * different types of converters, there are other functions in this API to\r
  30  * iterate over the converter aliases. \r
  31  * \r
  32  * @stable ICU 3.6\r
  33  */\r
  34 public abstract class CharsetICU extends Charset{\r
  35 \r
  36      String icuCanonicalName;\r
  37      String javaCanonicalName;\r
  38      int options;\r
  39 \r
  40      float  maxCharsPerByte;\r
  41     \r
  42      String name; /* +4: 60  internal name of the converter- invariant chars */\r
  43 \r
  44      int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */\r
  45 \r
  46      byte platform;                /* +68: 1 platform of the converter (only IBM now) */\r
  47      byte conversionType;          /* +69: 1 conversion type */\r
  48 \r
  49      int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */\r
  50      int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */\r
  51 \r
  52      byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */\r
  53      byte subCharLen;              /* +76: 1 */\r
  54     \r
  55      byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */\r
  56      byte hasFromUnicodeFallback; /* +78: 1 */\r
  57      short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */\r
  58      byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */\r
  59      //byte reserved[/*19*/];           /* +81: 19 to round out the structure */\r
  60      \r
  61      \r
  62     // typedef enum UConverterUnicodeSet {\r
  63      /** \r
  64       * Parameter that select the set of roundtrippable Unicode code points. \r
  65       * @stable ICU 4.0\r
  66       */\r
  67       public static final int ROUNDTRIP_SET=0; \r
  68       /**\r
  69        * Select the set of Unicode code points with roundtrip or fallback mappings.\r
  70        * Not supported at this point.\r
  71        * @internal\r
  72        * @deprecated This API is ICU internal only.\r
  73        */\r
  74       public static final int ROUNDTRIP_AND_FALLBACK_SET =1;\r
  75       \r
  76     //} UConverterUnicodeSet;\r
  77      \r
  78     /**\r
  79      * \r
  80      * @param icuCanonicalName\r
  81      * @param canonicalName\r
  82      * @param aliases\r
  83      * @stable ICU 3.6\r
  84      */\r
  85     protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {\r
  86         super(canonicalName,aliases);\r
  87         if(canonicalName.length() == 0){\r
  88             throw new IllegalCharsetNameException(canonicalName);\r
  89         }\r
  90         this.javaCanonicalName = canonicalName;\r
  91         this.icuCanonicalName  = icuCanonicalName;\r
  92     }\r
  93     \r
  94     /**\r
  95      * Ascertains if a charset is a sub set of this charset\r
  96      * Implements the abstract method of super class.\r
  97      * @param cs charset to test\r
  98      * @return true if the given charset is a subset of this charset\r
  99      * @stable ICU 3.6\r
 100      */\r
 101     public boolean contains(Charset cs){\r
 102         if (null == cs) {\r
 103             return false;\r
 104         } else if (this.equals(cs)) {\r
 105             return true;\r
 106         }\r
 107         return false;\r
 108     }\r
 109     private static final HashMap algorithmicCharsets = new HashMap();\r
 110     static{\r
 111         algorithmicCharsets.put("LMBCS-1",               "com.ibm.icu.charset.CharsetLMBCS");\r
 112         algorithmicCharsets.put("BOCU-1",                "com.ibm.icu.charset.CharsetBOCU1" );\r
 113         algorithmicCharsets.put("SCSU",                  "com.ibm.icu.charset.CharsetSCSU" ); \r
 114         algorithmicCharsets.put("US-ASCII",              "com.ibm.icu.charset.CharsetASCII" );\r
 115         algorithmicCharsets.put("ISO-8859-1",            "com.ibm.icu.charset.Charset88591" );\r
 116         algorithmicCharsets.put("UTF-16",                "com.ibm.icu.charset.CharsetUTF16" );\r
 117         algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.charset.CharsetUTF16BE" );\r
 118         algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.charset.CharsetUTF16LE" );\r
 119         algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF16LE" );\r
 120         algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF16" );\r
 121         algorithmicCharsets.put("UTF-32",                "com.ibm.icu.charset.CharsetUTF32" );\r
 122         algorithmicCharsets.put("UTF-32BE",              "com.ibm.icu.charset.CharsetUTF32BE" );\r
 123         algorithmicCharsets.put("UTF-32LE",              "com.ibm.icu.charset.CharsetUTF32LE" );\r
 124         algorithmicCharsets.put("UTF32_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF32LE" );\r
 125         algorithmicCharsets.put("UTF32_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF32" );\r
 126         algorithmicCharsets.put("UTF-8",                 "com.ibm.icu.charset.CharsetUTF8" );\r
 127         algorithmicCharsets.put("CESU-8",                "com.ibm.icu.charset.CharsetCESU8" );\r
 128         algorithmicCharsets.put("UTF-7",                 "com.ibm.icu.charset.CharsetUTF7" );\r
 129         algorithmicCharsets.put("ISCII,version=0",       "com.ibm.icu.charset.CharsetISCII" );\r
 130         algorithmicCharsets.put("ISCII,version=1",       "com.ibm.icu.charset.CharsetISCII" );\r
 131         algorithmicCharsets.put("ISCII,version=2",       "com.ibm.icu.charset.CharsetISCII" );\r
 132         algorithmicCharsets.put("ISCII,version=3",       "com.ibm.icu.charset.CharsetISCII" );\r
 133         algorithmicCharsets.put("ISCII,version=4",       "com.ibm.icu.charset.CharsetISCII" );\r
 134         algorithmicCharsets.put("ISCII,version=5",       "com.ibm.icu.charset.CharsetISCII" );\r
 135         algorithmicCharsets.put("ISCII,version=6",       "com.ibm.icu.charset.CharsetISCII" );\r
 136         algorithmicCharsets.put("ISCII,version=7",       "com.ibm.icu.charset.CharsetISCII" );\r
 137         algorithmicCharsets.put("ISCII,version=8",       "com.ibm.icu.charset.CharsetISCII" );\r
 138         algorithmicCharsets.put("IMAP-mailbox-name",     "com.ibm.icu.charset.CharsetUTF7" );\r
 139         algorithmicCharsets.put("HZ",                    "com.ibm.icu.charset.CharsetHZ" );\r
 140         algorithmicCharsets.put("ISO_2022,locale=ja,version=0",               "com.ibm.icu.charset.CharsetISO2022" );\r
 141         algorithmicCharsets.put("ISO_2022,locale=ja,version=1",               "com.ibm.icu.charset.CharsetISO2022" );\r
 142         algorithmicCharsets.put("ISO_2022,locale=ja,version=2",               "com.ibm.icu.charset.CharsetISO2022" );\r
 143         algorithmicCharsets.put("ISO_2022,locale=ja,version=3",               "com.ibm.icu.charset.CharsetISO2022" );\r
 144         algorithmicCharsets.put("ISO_2022,locale=ja,version=4",               "com.ibm.icu.charset.CharsetISO2022" );\r
 145         algorithmicCharsets.put("ISO_2022,locale=zh,version=0",               "com.ibm.icu.charset.CharsetISO2022" );\r
 146         algorithmicCharsets.put("ISO_2022,locale=zh,version=1",               "com.ibm.icu.charset.CharsetISO2022" );\r
 147         algorithmicCharsets.put("ISO_2022,locale=ko,version=0",               "com.ibm.icu.charset.CharsetISO2022" );\r
 148         algorithmicCharsets.put("ISO_2022,locale=ko,version=1",               "com.ibm.icu.charset.CharsetISO2022" );\r
 149         }\r
 150 \r
 151     /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){\r
 152        String className = (String) algorithmicCharsets.get(icuCanonicalName);\r
 153        if(className==null){\r
 154            //all the cnv files are loaded as MBCS\r
 155            className = "com.ibm.icu.charset.CharsetMBCS";\r
 156        }\r
 157        try{\r
 158            CharsetICU conv = null;\r
 159            Class cs = Class.forName(className);\r
 160            Class[] paramTypes = new Class[]{ String.class, String.class,  String[].class};\r
 161            final Constructor c = cs.getConstructor(paramTypes);\r
 162            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};\r
 163            \r
 164            // Run constructor\r
 165            try {\r
 166                Object obj = c.newInstance(params);\r
 167                if(obj!=null && obj instanceof CharsetICU){\r
 168                    conv = (CharsetICU)obj;\r
 169                    return conv;\r
 170                }\r
 171            }catch (InvocationTargetException e) {\r
 172                throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());    \r
 173            }\r
 174        }catch(ClassNotFoundException ex){\r
 175        }catch(NoSuchMethodException ex){\r
 176        }catch (IllegalAccessException ex){ \r
 177        }catch (InstantiationException ex){ \r
 178        }\r
 179        throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);    \r
 180     }\r
 181     \r
 182     static final boolean isSurrogate(int c){\r
 183         return (((c)&0xfffff800)==0xd800);\r
 184     }\r
 185     \r
 186     /*\r
 187      * Returns the default charset name \r
 188      */\r
 189 //    static final String getDefaultCharsetName(){\r
 190 //        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();\r
 191 //        return defaultEncoding;\r
 192 //    }\r
 193 \r
 194     /**\r
 195      * Returns a charset object for the named charset.\r
 196      * This method gurantee that ICU charset is returned when\r
 197      * available.  If the ICU charset provider does not support\r
 198      * the specified charset, then try other charset providers\r
 199      * including the standard Java charset provider.\r
 200      * \r
 201      * @param charsetName The name of the requested charset,\r
 202      * may be either a canonical name or an alias\r
 203      * @return A charset object for the named charset\r
 204      * @throws IllegalCharsetNameException If the given charset name\r
 205      * is illegal\r
 206      * @throws UnsupportedCharsetException If no support for the\r
 207      * named charset is available in this instance of th Java\r
 208      * virtual machine\r
 209      * @stable ICU 3.6\r
 210      */\r
 211     public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {\r
 212         CharsetProviderICU icuProvider = new CharsetProviderICU();\r
 213         CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);\r
 214         if (cs != null) {\r
 215             return cs;\r
 216         }\r
 217         return Charset.forName(charsetName);\r
 218     }\r
 219 \r
 220 //    /**\r
 221 //     * @see java.lang.Comparable#compareTo(java.lang.Object)\r
 222 //     * @stable 3.8\r
 223 //     */\r
 224 //    public int compareTo(Object otherObj) {\r
 225 //        if (!(otherObj instanceof CharsetICU)) {\r
 226 //            return -1;\r
 227 //        }\r
 228 //        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);\r
 229 //    }\r
 230 \r
 231     /**\r
 232      * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the\r
 233      * start of the stream for example U+FEFF (the Unicode BOM/signature\r
 234      * character) that can be ignored.\r
 235      * \r
 236      * Detects Unicode signature byte sequences at the start of the byte stream\r
 237      * and returns number of bytes of the BOM of the indicated Unicode charset.\r
 238      * 0 is returned when no Unicode signature is recognized.\r
 239      * \r
 240      */\r
 241     // TODO This should be proposed as CharsetDecoderICU API.\r
 242 //    static String detectUnicodeSignature(ByteBuffer source) {\r
 243 //        int signatureLength = 0; // number of bytes of the signature\r
 244 //        final int SIG_MAX_LEN = 5;\r
 245 //        String sigUniCharset = null; // states what unicode charset is the BOM\r
 246 //        int i = 0;\r
 247 //\r
 248 //        /*\r
 249 //         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we\r
 250 //         * don't misdetect something\r
 251 //         */\r
 252 //        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,\r
 253 //                (byte) 0xa5 };\r
 254 //\r
 255 //        while (i < source.remaining() && i < SIG_MAX_LEN) {\r
 256 //            start[i] = source.get(i);\r
 257 //            i++;\r
 258 //        }\r
 259 //\r
 260 //        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {\r
 261 //            signatureLength = 2;\r
 262 //            sigUniCharset = "UTF-16BE";\r
 263 //            source.position(signatureLength);\r
 264 //            return sigUniCharset;\r
 265 //        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {\r
 266 //            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {\r
 267 //                signatureLength = 4;\r
 268 //                sigUniCharset = "UTF-32LE";\r
 269 //                source.position(signatureLength);\r
 270 //                return sigUniCharset;\r
 271 //            } else {\r
 272 //                signatureLength = 2;\r
 273 //                sigUniCharset = "UTF-16LE";\r
 274 //                source.position(signatureLength);\r
 275 //                return sigUniCharset;\r
 276 //            }\r
 277 //        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB\r
 278 //                && start[2] == (byte) 0xBF) {\r
 279 //            signatureLength = 3;\r
 280 //            sigUniCharset = "UTF-8";\r
 281 //            source.position(signatureLength);\r
 282 //            return sigUniCharset;\r
 283 //        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00\r
 284 //                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {\r
 285 //            signatureLength = 4;\r
 286 //            sigUniCharset = "UTF-32BE";\r
 287 //            source.position(signatureLength);\r
 288 //            return sigUniCharset;\r
 289 //        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE\r
 290 //                && start[2] == (byte) 0xFF) {\r
 291 //            signatureLength = 3;\r
 292 //            sigUniCharset = "SCSU";\r
 293 //            source.position(signatureLength);\r
 294 //            return sigUniCharset;\r
 295 //        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE\r
 296 //                && start[2] == (byte) 0x28) {\r
 297 //            signatureLength = 3;\r
 298 //            sigUniCharset = "BOCU-1";\r
 299 //            source.position(signatureLength);\r
 300 //            return sigUniCharset;\r
 301 //        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F\r
 302 //                && start[2] == (byte) 0x76) {\r
 303 //\r
 304 //            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {\r
 305 //                signatureLength = 5;\r
 306 //                sigUniCharset = "UTF-7";\r
 307 //                source.position(signatureLength);\r
 308 //                return sigUniCharset;\r
 309 //            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39\r
 310 //                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {\r
 311 //                signatureLength = 4;\r
 312 //                sigUniCharset = "UTF-7";\r
 313 //                source.position(signatureLength);\r
 314 //                return sigUniCharset;\r
 315 //            }\r
 316 //        } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73\r
 317 //                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {\r
 318 //            signatureLength = 4;\r
 319 //            sigUniCharset = "UTF-EBCDIC";\r
 320 //            source.position(signatureLength);\r
 321 //            return sigUniCharset;\r
 322 //        }\r
 323 //\r
 324 //        /* no known Unicode signature byte sequence recognized */\r
 325 //        return null;\r
 326 //    }\r
 327     \r
 328     \r
 329     abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);\r
 330     \r
 331     /**\r
 332     * <p>Returns the set of Unicode code points that can be converted by an ICU Converter. \r
 333     * <p>\r
 334     * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be \r
 335     * roundtrip-converted (converted without any data loss) with the converter This set will not include code points that have fallback \r
 336     * mappings or are only the result of reverse fallback mappings.  See UTR #22 "Character Mapping Markup Language" at  <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>\r
 337     * <p>* In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.\r
 338     * <p>\r
 339     * <p>This is useful for example for\r
 340     * <ul><li>checking that a string or document can be roundtrip-converted with a converter,\r
 341     *   without/before actually performing the conversion</li>\r
 342     * <li>testing if a converter can be used for text for typical text for a certain locale,\r
 343     *   by comparing its roundtrip set with the set of ExemplarCharacters from\r
 344     *   ICU's locale data or other sources</li></ul>\r
 345     *\r
 346     * @param setFillIn A valid UnicodeSet. It will be cleared by this function before \r
 347     *                   the converter's specific set is filled in.\r
 348     * @param which A selector; currently ROUNDTRIP_SET is the only supported value.\r
 349     * @throws IllegalArgumentException if the parameters does not match.              \r
 350     * @stable ICU 4.0\r
 351     */\r
 352        public void getUnicodeSet(UnicodeSet setFillIn, int which){\r
 353            if( setFillIn == null || which != ROUNDTRIP_SET ){\r
 354                throw new IllegalArgumentException();\r
 355            }\r
 356            setFillIn.clear();\r
 357            getUnicodeSetImpl(setFillIn, which);\r
 358        }\r
 359       \r
 360        static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){\r
 361            setFillIn.add(0, 0xd7ff);\r
 362            setFillIn.add(0xe000, 0x10ffff);\r
 363        }\r
 364        \r
 365        static void getCompleteUnicodeSet(UnicodeSet setFillIn){\r
 366            setFillIn.add(0, 0x10ffff);\r
 367        }\r
 368 \r
 369 }\r
 370 \r