2 *******************************************************************************
\r
3 * Copyright (C) 2005-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.IOException;
\r
10 import java.io.InputStream;
\r
11 import java.io.Reader;
\r
12 import java.util.ArrayList;
\r
13 import java.util.Arrays;
\r
14 import java.util.Collections;
\r
18 * <code>CharsetDetector</code> provides a facility for detecting the
\r
19 * charset or encoding of character data in an unknown format.
\r
20 * The input data can either be from an input stream or an array of bytes.
\r
21 * The result of the detection operation is a list of possibly matching
\r
22 * charsets, or, for simple use, you can just ask for a Java Reader that
\r
23 * will work over the input data.
\r
25 * Character set detection is at best an imprecise operation. The detection
\r
26 * process will attempt to identify the charset that best matches the characteristics
\r
27 * of the byte data, but the process is partly statistical in nature, and
\r
28 * the results can not be guaranteed to always be correct.
\r
30 * For best accuracy in charset detection, the input data should be primarily
\r
31 * in a single language, and a minimum of a few hundred bytes worth of plain text
\r
32 * in the language are needed. The detection process will attempt to
\r
33 * ignore html or xml style markup that could otherwise obscure the content.
\r
37 public class CharsetDetector {
\r
39 // Question: Should we have getters corresponding to the setters for inut text
\r
40 // and declared encoding?
\r
42 // A thought: If we were to create our own type of Java Reader, we could defer
\r
43 // figuring out an actual charset for data that starts out with too much English
\r
44 // only ASCII until the user actually read through to something that didn't look
\r
45 // like 7 bit English. If nothing else ever appeared, we would never need to
\r
46 // actually choose the "real" charset. All assuming that the application just
\r
47 // wants the data, and doesn't care about a char set name.
\r
54 public CharsetDetector() {
\r
58 * Set the declared encoding for charset detection.
\r
59 * The declared encoding of an input text is an encoding obtained
\r
60 * from an http header or xml declaration or similar source that
\r
61 * can be provided as additional information to the charset detector.
\r
62 * A match between a declared encoding and a possible detected encoding
\r
63 * will raise the quality of that detected encoding by a small delta,
\r
64 * and will also appear as a "reason" for the match.
\r
66 * A declared encoding that is incompatible with the input data being
\r
67 * analyzed will not be added to the list of possible encodings.
\r
69 * @param encoding The declared encoding
\r
73 public CharsetDetector setDeclaredEncoding(String encoding) {
\r
74 fDeclaredEncoding = encoding;
\r
79 * Set the input text (byte) data whose charset is to be detected.
\r
81 * @param in the input text of unknown encoding
\r
83 * @return This CharsetDetector
\r
87 public CharsetDetector setText(byte [] in) {
\r
89 fRawLength = in.length;
\r
96 private static final int kBufSize = 8000;
\r
99 * Set the input text (byte) data whose charset is to be detected.
\r
101 * The input stream that supplies the character data must have markSupported()
\r
102 * == true; the charset detection process will read a small amount of data,
\r
103 * then return the stream to its original position via
\r
104 * the InputStream.reset() operation. The exact amount that will
\r
105 * be read depends on the characteristics of the data itself.
\r
107 * @param in the input text of unknown encoding
\r
109 * @return This CharsetDetector
\r
114 public CharsetDetector setText(InputStream in) throws IOException {
\r
116 fInputStream.mark(kBufSize);
\r
117 fRawInput = new byte[kBufSize]; // Always make a new buffer because the
\r
118 // previous one may have come from the caller,
\r
119 // in which case we can't touch it.
\r
121 int remainingLength = kBufSize;
\r
122 while (remainingLength > 0 ) {
\r
123 // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
\r
124 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
\r
125 if (bytesRead <= 0) {
\r
128 fRawLength += bytesRead;
\r
129 remainingLength -= bytesRead;
\r
131 fInputStream.reset();
\r
133 MungeInput(); // Strip html markup, collect byte stats.
\r
139 * Return the charset that best matches the supplied input data.
\r
141 * Note though, that because the detection
\r
142 * only looks at the start of the input data,
\r
143 * there is a possibility that the returned charset will fail to handle
\r
144 * the full set of input data.
\r
146 * Raise an exception if
\r
148 * <li>no charset appears to match the data.</li>
\r
149 * <li>no input text has been provided</li>
\r
152 * @return a CharsetMatch object representing the best matching charset, or
\r
153 * <code>null</code> if there are no matches.
\r
157 public CharsetMatch detect() {
\r
158 // TODO: A better implementation would be to copy the detect loop from
\r
159 // detectAll(), and cut it short as soon as a match with a high confidence
\r
160 // is found. This is something to be done later, after things are otherwise
\r
162 CharsetMatch matches[] = detectAll();
\r
164 if (matches == null || matches.length == 0) {
\r
172 * Return an array of all charsets that appear to be plausible
\r
173 * matches with the input data. The array is ordered with the
\r
174 * best quality match first.
\r
176 * Raise an exception if
\r
178 * <li>no charsets appear to match the input data.</li>
\r
179 * <li>no input text has been provided</li>
\r
182 * @return An array of CharsetMatch objects representing possibly matching charsets.
\r
186 public CharsetMatch[] detectAll() {
\r
187 CharsetRecognizer csr;
\r
191 ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
\r
193 // Iterate over all possible charsets, remember all that
\r
194 // give a match quality > 0.
\r
195 for (i=0; i<fCSRecognizers.size(); i++) {
\r
196 csr = fCSRecognizers.get(i);
\r
197 detectResults = csr.match(this);
\r
198 confidence = detectResults & 0x000000ff;
\r
199 if (confidence > 0) {
\r
200 CharsetMatch m = new CharsetMatch(this, csr, confidence);
\r
204 Collections.sort(matches); // CharsetMatch compares on confidence
\r
205 Collections.reverse(matches); // Put best match first.
\r
206 CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
\r
207 resultArray = matches.toArray(resultArray);
\r
208 return resultArray;
\r
213 * Autodetect the charset of an inputStream, and return a Java Reader
\r
214 * to access the converted input data.
\r
216 * This is a convenience method that is equivalent to
\r
217 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
\r
219 * For the input stream that supplies the character data, markSupported()
\r
220 * must be true; the charset detection will read a small amount of data,
\r
221 * then return the stream to its original position via
\r
222 * the InputStream.reset() operation. The exact amount that will
\r
223 * be read depends on the characteristics of the data itself.
\r
225 * Raise an exception if no charsets appear to match the input data.
\r
227 * @param in The source of the byte data in the unknown charset.
\r
229 * @param declaredEncoding A declared encoding for the data, if available,
\r
230 * or null or an empty string if none is available.
\r
234 public Reader getReader(InputStream in, String declaredEncoding) {
\r
235 fDeclaredEncoding = declaredEncoding;
\r
240 CharsetMatch match = detect();
\r
242 if (match == null) {
\r
246 return match.getReader();
\r
247 } catch (IOException e) {
\r
253 * Autodetect the charset of an inputStream, and return a String
\r
254 * containing the converted input data.
\r
256 * This is a convenience method that is equivalent to
\r
257 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
\r
259 * Raise an exception if no charsets appear to match the input data.
\r
261 * @param in The source of the byte data in the unknown charset.
\r
263 * @param declaredEncoding A declared encoding for the data, if available,
\r
264 * or null or an empty string if none is available.
\r
268 public String getString(byte[] in, String declaredEncoding)
\r
270 fDeclaredEncoding = declaredEncoding;
\r
275 CharsetMatch match = detect();
\r
277 if (match == null) {
\r
281 return match.getString(-1);
\r
282 } catch (IOException e) {
\r
289 * Get the names of all char sets that can be recognized by the char set detector.
\r
291 * @return an array of the names of all charsets that can be recognized
\r
292 * by the charset detector.
\r
296 public static String[] getAllDetectableCharsets() {
\r
297 return fCharsetNames;
\r
301 * Test whether or not input filtering is enabled.
\r
303 * @return <code>true</code> if input text will be filtered.
\r
305 * @see #enableInputFilter
\r
309 public boolean inputFilterEnabled()
\r
315 * Enable filtering of input text. If filtering is enabled,
\r
316 * text within angle brackets ("<" and ">") will be removed
\r
317 * before detection.
\r
319 * @param filter <code>true</code> to enable input text filtering.
\r
321 * @return The previous setting.
\r
325 public boolean enableInputFilter(boolean filter)
\r
327 boolean previous = fStripTags;
\r
329 fStripTags = filter;
\r
335 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
\r
336 * it by removing what appears to be html markup.
\r
338 private void MungeInput() {
\r
342 boolean inMarkup = false;
\r
347 // html / xml markup stripping.
\r
348 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
\r
349 // discard everything within < brackets >
\r
350 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
\r
351 // guess as to whether the input was actually marked up at all.
\r
353 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
\r
354 b = fRawInput[srci];
\r
355 if (b == (byte)'<') {
\r
364 fInputBytes[dsti++] = b;
\r
367 if (b == (byte)'>') {
\r
376 // If it looks like this input wasn't marked up, or if it looks like it's
\r
377 // essentially nothing but markup abandon the markup stripping.
\r
378 // Detection will have to work on the unstripped input.
\r
380 if (openTags<5 || openTags/5 < badTags ||
\r
381 (fInputLen < 100 && fRawLength>600)) {
\r
382 int limit = fRawLength;
\r
384 if (limit > kBufSize) {
\r
388 for (srci=0; srci<limit; srci++) {
\r
389 fInputBytes[srci] = fRawInput[srci];
\r
395 // Tally up the byte occurence statistics.
\r
396 // These are available for use by the various detectors.
\r
398 Arrays.fill(fByteStats, (short)0);
\r
399 for (srci=0; srci<fInputLen; srci++) {
\r
400 int val = fInputBytes[srci] & 0x00ff;
\r
405 for (int i = 0x80; i <= 0x9F; i += 1) {
\r
406 if (fByteStats[i] != 0) {
\r
414 * The following items are accessed by individual CharsetRecongizers during
\r
415 * the recognition process
\r
418 byte[] fInputBytes = // The text to be checked. Markup will have been
\r
419 new byte[kBufSize]; // removed if appropriate.
\r
421 int fInputLen; // Length of the byte data in fInputText.
\r
423 short fByteStats[] = // byte frequency statistics for the input text.
\r
424 new short[256]; // Value is percent, not absolute.
\r
425 // Value is rounded up, so zero really means zero occurences.
\r
427 boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
\r
430 String fDeclaredEncoding;
\r
435 // Stuff private to CharsetDetector
\r
437 byte[] fRawInput; // Original, untouched input bytes.
\r
438 // If user gave us a byte array, this is it.
\r
439 // If user gave us a stream, it's read to a
\r
441 int fRawLength; // Length of data in fRawInput array.
\r
443 InputStream fInputStream; // User's input stream, or null if the user
\r
444 // gave us a byte array.
\r
446 boolean fStripTags = // If true, setText() will strip tags from input text.
\r
451 * List of recognizers for all charsets known to the implementation.
\r
453 private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
\r
454 private static String [] fCharsetNames;
\r
457 * Create the singleton instances of the CharsetRecognizer classes
\r
459 private static ArrayList<CharsetRecognizer> createRecognizers() {
\r
460 ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
\r
462 recognizers.add(new CharsetRecog_UTF8());
\r
464 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
\r
465 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
\r
466 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
\r
467 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
\r
469 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
\r
470 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
\r
471 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
\r
472 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
\r
473 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
\r
474 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
\r
475 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
\r
476 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
\r
478 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
\r
479 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
\r
480 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
\r
481 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
\r
482 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
\r
483 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
\r
484 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
\r
485 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
\r
486 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
\r
487 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
\r
488 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
\r
489 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
\r
490 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
\r
491 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
\r
492 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
\r
493 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
\r
494 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
\r
495 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
\r
496 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
\r
497 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
\r
498 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
\r
499 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
\r
500 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
\r
502 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
\r
503 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
\r
504 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
\r
505 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
\r
507 // Create an array of all charset names, as a side effect.
\r
508 // Needed for the getAllDetectableCharsets() API.
\r
509 String[] charsetNames = new String [recognizers.size()];
\r
512 for (int i = 0; i < recognizers.size(); i++) {
\r
513 String name = recognizers.get(i).getName();
\r
515 if (out == 0 || ! name.equals(charsetNames[out - 1])) {
\r
516 charsetNames[out++] = name;
\r
520 fCharsetNames = new String[out];
\r
521 System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
\r
523 return recognizers;
\r