jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/text/CharsetMatch.java

   1 /**\r
   2 *******************************************************************************\r
   3 * Copyright (C) 2005-2010, International Business Machines Corporation and    *\r
   4 * others. All Rights Reserved.                                                *\r
   5 *******************************************************************************\r
   6 */\r
   7 package com.ibm.icu.text;\r
   8 \r
   9 import java.io.ByteArrayInputStream;\r
  10 import java.io.IOException;\r
  11 import java.io.InputStream;\r
  12 import java.io.InputStreamReader;\r
  13 import java.io.Reader;\r
  14 \r
  15 \r
  16 /**\r
  17  * This class represents a charset that has been identified by a CharsetDetector\r
  18  * as a possible encoding for a set of input data.  From an instance of this\r
  19  * class, you can ask for a confidence level in the charset identification,\r
  20  * or for Java Reader or String to access the original byte data in Unicode form.\r
  21  * <p/>\r
  22  * Instances of this class are created only by CharsetDetectors.\r
  23  * <p/>\r
  24  * Note:  this class has a natural ordering that is inconsistent with equals.\r
  25  *        The natural ordering is based on the match confidence value.\r
  26  *\r
  27  * @stable ICU 3.4\r
  28  */\r
  29 public class CharsetMatch implements Comparable<CharsetMatch> {\r
  30 \r
  31     \r
  32     /**\r
  33      * Create a java.io.Reader for reading the Unicode character data corresponding\r
  34      * to the original byte data supplied to the Charset detect operation.\r
  35      * <p/>\r
  36      * CAUTION:  if the source of the byte data was an InputStream, a Reader\r
  37      * can be created for only one matching char set using this method.  If more \r
  38      * than one charset needs to be tried, the caller will need to reset\r
  39      * the InputStream and create InputStreamReaders itself, based on the charset name.\r
  40      *\r
  41      * @return the Reader for the Unicode character data.\r
  42      *\r
  43      * @stable ICU 3.4\r
  44      */\r
  45     public Reader getReader() {\r
  46         InputStream inputStream = fInputStream;\r
  47         \r
  48         if (inputStream == null) {\r
  49             inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);\r
  50         }\r
  51         \r
  52         try {\r
  53             inputStream.reset();\r
  54             return new InputStreamReader(inputStream, getName());\r
  55         } catch (IOException e) {\r
  56             return null;\r
  57         }\r
  58     }\r
  59 \r
  60     /**\r
  61      * Create a Java String from Unicode character data corresponding\r
  62      * to the original byte data supplied to the Charset detect operation.\r
  63      *\r
  64      * @return a String created from the converted input data.\r
  65      *\r
  66      * @stable ICU 3.4\r
  67      */\r
  68     public String getString()  throws java.io.IOException {\r
  69         return getString(-1);\r
  70 \r
  71     }\r
  72 \r
  73     /**\r
  74      * Create a Java String from Unicode character data corresponding\r
  75      * to the original byte data supplied to the Charset detect operation.\r
  76      * The length of the returned string is limited to the specified size;\r
  77      * the string will be trunctated to this length if necessary.  A limit value of\r
  78      * zero or less is ignored, and treated as no limit.\r
  79      *\r
  80      * @param maxLength The maximium length of the String to be created when the\r
  81      *                  source of the data is an input stream, or -1 for\r
  82      *                  unlimited length.\r
  83      * @return a String created from the converted input data.\r
  84      *\r
  85      * @stable ICU 3.4\r
  86      */\r
  87     public String getString(int maxLength) throws java.io.IOException {\r
  88         String result = null;\r
  89         if (fInputStream != null) {\r
  90             StringBuilder sb = new StringBuilder();\r
  91             char[] buffer = new char[1024];\r
  92             Reader reader = getReader();\r
  93             int max = maxLength < 0? Integer.MAX_VALUE : maxLength;\r
  94             int bytesRead = 0;\r
  95             \r
  96             while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {\r
  97                 sb.append(buffer, 0, bytesRead);\r
  98                 max -= bytesRead;\r
  99             }\r
 100             \r
 101             reader.close();\r
 102             \r
 103             return sb.toString();\r
 104         } else {\r
 105             result = new String(fRawInput, getName());            \r
 106         }\r
 107         return result;\r
 108 \r
 109     }\r
 110     \r
 111     /**\r
 112      * Get an indication of the confidence in the charset detected.\r
 113      * Confidence values range from 0-100, with larger numbers indicating\r
 114      * a better match of the input data to the characteristics of the\r
 115      * charset.\r
 116      *\r
 117      * @return the confidence in the charset match\r
 118      *\r
 119      * @stable ICU 3.4\r
 120      */\r
 121     public int getConfidence() {\r
 122         return fConfidence;\r
 123     }\r
 124     \r
 125 \r
 126     /**\r
 127      * Bit flag indicating the match is based on the the encoding scheme.\r
 128      *\r
 129      * @see #getMatchType\r
 130      * @stable ICU 3.4\r
 131      */\r
 132     static public final int ENCODING_SCHEME    = 1;\r
 133     \r
 134     /**\r
 135      * Bit flag indicating the match is based on the presence of a BOM.\r
 136      * \r
 137      * @see #getMatchType\r
 138      * @stable ICU 3.4\r
 139      */\r
 140     static public final int BOM                = 2;\r
 141     \r
 142     /**\r
 143      * Bit flag indicating he match is based on the declared encoding.\r
 144      * \r
 145      * @see #getMatchType\r
 146      * @stable ICU 3.4\r
 147      */\r
 148     static public final int DECLARED_ENCODING  = 4;\r
 149     \r
 150     /**\r
 151      * Bit flag indicating the match is based on language statistics.\r
 152      *\r
 153      * @see #getMatchType\r
 154      * @stable ICU 3.4\r
 155      */\r
 156     static public final int LANG_STATISTICS    = 8;\r
 157     \r
 158     /**\r
 159      * Return flags indicating what it was about the input data \r
 160      * that caused this charset to be considered as a possible match.\r
 161      * The result is a bitfield containing zero or more of the flags\r
 162      * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.\r
 163      * A result of zero means no information is available.\r
 164      * <p>\r
 165      * Note: currently, this method always returns zero.\r
 166      * <p>\r
 167      *\r
 168      * @return the type of match found for this charset.\r
 169      *\r
 170      * @draft ICU 3.4\r
 171      * @provisional This API might change or be removed in a future release.\r
 172      */\r
 173     public int getMatchType() {\r
 174 //      TODO: create a list of enum-like constants for common combinations of types of matches.\r
 175         return 0;\r
 176     }\r
 177 \r
 178     /**\r
 179      * Get the name of the detected charset.  \r
 180      * The name will be one that can be used with other APIs on the\r
 181      * platform that accept charset names.  It is the "Canonical name"\r
 182      * as defined by the class java.nio.charset.Charset; for\r
 183      * charsets that are registered with the IANA charset registry,\r
 184      * this is the MIME-preferred registerd name.\r
 185      *\r
 186      * @see java.nio.charset.Charset\r
 187      * @see java.io.InputStreamReader\r
 188      *\r
 189      * @return The name of the charset.\r
 190      *\r
 191      * @stable ICU 3.4\r
 192      */\r
 193     public String getName() {\r
 194         return fRecognizer.getName();\r
 195     }\r
 196     \r
 197     /**\r
 198      * Get the ISO code for the language of the detected charset.  \r
 199      *\r
 200      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.\r
 201      *\r
 202      * @stable ICU 3.4\r
 203      */\r
 204     public String getLanguage() {\r
 205         return fRecognizer.getLanguage();\r
 206     }\r
 207 \r
 208     /**\r
 209      * Compare to other CharsetMatch objects.\r
 210      * Comparison is based on the match confidence value, which \r
 211      *   allows CharsetDetector.detectAll() to order its results. \r
 212      *\r
 213      * @param other the CharsetMatch object to compare against.\r
 214      * @return  a negative integer, zero, or a positive integer as the \r
 215      *          confidence level of this CharsetMatch\r
 216      *          is less than, equal to, or greater than that of\r
 217      *          the argument.\r
 218      * @throws ClassCastException if the argument is not a CharsetMatch.\r
 219      * @stable ICU 4.4\r
 220      */\r
 221     public int compareTo (CharsetMatch other) {\r
 222         int compareResult = 0;\r
 223         if (this.fConfidence > other.fConfidence) {\r
 224             compareResult = 1;\r
 225         } else if (this.fConfidence < other.fConfidence) {\r
 226             compareResult = -1;\r
 227         }\r
 228         return compareResult;\r
 229     }\r
 230 \r
 231     /*\r
 232      *  Constructor.  Implementation internal\r
 233      */\r
 234     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {\r
 235         fRecognizer = rec;\r
 236         fConfidence = conf;\r
 237         \r
 238         // The references to the original aplication input data must be copied out\r
 239         //   of the charset recognizer to here, in case the application resets the\r
 240         //   recognizer before using this CharsetMatch.\r
 241         if (det.fInputStream == null) {\r
 242             // We only want the existing input byte data if it came straight from the user,\r
 243             //   not if is just the head of a stream.\r
 244             fRawInput    = det.fRawInput;\r
 245             fRawLength   = det.fRawLength;\r
 246         }\r
 247         fInputStream = det.fInputStream;\r
 248     }\r
 249 \r
 250     \r
 251     //\r
 252     //   Private Data\r
 253     //\r
 254     private int                 fConfidence;\r
 255     private CharsetRecognizer   fRecognizer;\r
 256     private byte[]              fRawInput = null;     // Original, untouched input bytes.\r
 257                                                       //  If user gave us a byte array, this is it.\r
 258     private int                 fRawLength;           // Length of data in fRawInput array.\r
 259 \r
 260     private InputStream         fInputStream = null;  // User's input stream, or null if the user\r
 261                                                       //   gave us a byte array.\r
 262 }\r