jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/CharsetMatch.java

   1 /**
   2 *******************************************************************************
   3 * Copyright (C) 2005-2012, International Business Machines Corporation and    *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 */
   7 package com.ibm.icu.text;
   8
   9 import java.io.ByteArrayInputStream;
  10 import java.io.IOException;
  11 import java.io.InputStream;
  12 import java.io.InputStreamReader;
  13 import java.io.Reader;
  14
  15
  16 /**
  17  * This class represents a charset that has been identified by a CharsetDetector
  18  * as a possible encoding for a set of input data.  From an instance of this
  19  * class, you can ask for a confidence level in the charset identification,
  20  * or for Java Reader or String to access the original byte data in Unicode form.
  21  * <p/>
  22  * Instances of this class are created only by CharsetDetectors.
  23  * <p/>
  24  * Note:  this class has a natural ordering that is inconsistent with equals.
  25  *        The natural ordering is based on the match confidence value.
  26  *
  27  * @stable ICU 3.4
  28  */
  29 public class CharsetMatch implements Comparable<CharsetMatch> {
  30
  31
  32     /**
  33      * Create a java.io.Reader for reading the Unicode character data corresponding
  34      * to the original byte data supplied to the Charset detect operation.
  35      * <p/>
  36      * CAUTION:  if the source of the byte data was an InputStream, a Reader
  37      * can be created for only one matching char set using this method.  If more
  38      * than one charset needs to be tried, the caller will need to reset
  39      * the InputStream and create InputStreamReaders itself, based on the charset name.
  40      *
  41      * @return the Reader for the Unicode character data.
  42      *
  43      * @stable ICU 3.4
  44      */
  45     public Reader getReader() {
  46         InputStream inputStream = fInputStream;
  47
  48         if (inputStream == null) {
  49             inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
  50         }
  51
  52         try {
  53             inputStream.reset();
  54             return new InputStreamReader(inputStream, getName());
  55         } catch (IOException e) {
  56             return null;
  57         }
  58     }
  59
  60     /**
  61      * Create a Java String from Unicode character data corresponding
  62      * to the original byte data supplied to the Charset detect operation.
  63      *
  64      * @return a String created from the converted input data.
  65      *
  66      * @stable ICU 3.4
  67      */
  68     public String getString()  throws java.io.IOException {
  69         return getString(-1);
  70
  71     }
  72
  73     /**
  74      * Create a Java String from Unicode character data corresponding
  75      * to the original byte data supplied to the Charset detect operation.
  76      * The length of the returned string is limited to the specified size;
  77      * the string will be trunctated to this length if necessary.  A limit value of
  78      * zero or less is ignored, and treated as no limit.
  79      *
  80      * @param maxLength The maximium length of the String to be created when the
  81      *                  source of the data is an input stream, or -1 for
  82      *                  unlimited length.
  83      * @return a String created from the converted input data.
  84      *
  85      * @stable ICU 3.4
  86      */
  87     public String getString(int maxLength) throws java.io.IOException {
  88         String result = null;
  89         if (fInputStream != null) {
  90             StringBuilder sb = new StringBuilder();
  91             char[] buffer = new char[1024];
  92             Reader reader = getReader();
  93             int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
  94             int bytesRead = 0;
  95
  96             while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
  97                 sb.append(buffer, 0, bytesRead);
  98                 max -= bytesRead;
  99             }
 100
 101             reader.close();
 102
 103             return sb.toString();
 104         } else {
 105             String name = getName();
 106             /*
 107              * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
 108              * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
 109              * should be stripped off before creating the string.
 110              */
 111             int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
 112             if (startSuffix > 0) {
 113                 name = name.substring(0, startSuffix);
 114             }
 115             result = new String(fRawInput, name);
 116         }
 117         return result;
 118
 119     }
 120
 121     /**
 122      * Get an indication of the confidence in the charset detected.
 123      * Confidence values range from 0-100, with larger numbers indicating
 124      * a better match of the input data to the characteristics of the
 125      * charset.
 126      *
 127      * @return the confidence in the charset match
 128      *
 129      * @stable ICU 3.4
 130      */
 131     public int getConfidence() {
 132         return fConfidence;
 133     }
 134
 135     /**
 136      * Get the name of the detected charset.
 137      * The name will be one that can be used with other APIs on the
 138      * platform that accept charset names.  It is the "Canonical name"
 139      * as defined by the class java.nio.charset.Charset; for
 140      * charsets that are registered with the IANA charset registry,
 141      * this is the MIME-preferred registerd name.
 142      *
 143      * @see java.nio.charset.Charset
 144      * @see java.io.InputStreamReader
 145      *
 146      * @return The name of the charset.
 147      *
 148      * @stable ICU 3.4
 149      */
 150     public String getName() {
 151         return fCharsetName;
 152     }
 153
 154     /**
 155      * Get the ISO code for the language of the detected charset.
 156      *
 157      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
 158      *
 159      * @stable ICU 3.4
 160      */
 161     public String getLanguage() {
 162         return fLang;
 163     }
 164
 165     /**
 166      * Compare to other CharsetMatch objects.
 167      * Comparison is based on the match confidence value, which
 168      *   allows CharsetDetector.detectAll() to order its results.
 169      *
 170      * @param other the CharsetMatch object to compare against.
 171      * @return  a negative integer, zero, or a positive integer as the
 172      *          confidence level of this CharsetMatch
 173      *          is less than, equal to, or greater than that of
 174      *          the argument.
 175      * @throws ClassCastException if the argument is not a CharsetMatch.
 176      * @stable ICU 4.4
 177      */
 178     public int compareTo (CharsetMatch other) {
 179         int compareResult = 0;
 180         if (this.fConfidence > other.fConfidence) {
 181             compareResult = 1;
 182         } else if (this.fConfidence < other.fConfidence) {
 183             compareResult = -1;
 184         }
 185         return compareResult;
 186     }
 187
 188     /*
 189      *  Constructor.  Implementation internal
 190      */
 191     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
 192         fConfidence = conf;
 193
 194         // The references to the original application input data must be copied out
 195         //   of the charset recognizer to here, in case the application resets the
 196         //   recognizer before using this CharsetMatch.
 197         if (det.fInputStream == null) {
 198             // We only want the existing input byte data if it came straight from the user,
 199             //   not if is just the head of a stream.
 200             fRawInput    = det.fRawInput;
 201             fRawLength   = det.fRawLength;
 202         }
 203         fInputStream = det.fInputStream;
 204         fCharsetName = rec.getName();
 205         fLang = rec.getLanguage();
 206     }
 207
 208     /*
 209      *  Constructor.  Implementation internal
 210      */
 211     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
 212         fConfidence = conf;
 213
 214         // The references to the original application input data must be copied out
 215         //   of the charset recognizer to here, in case the application resets the
 216         //   recognizer before using this CharsetMatch.
 217         if (det.fInputStream == null) {
 218             // We only want the existing input byte data if it came straight from the user,
 219             //   not if is just the head of a stream.
 220             fRawInput    = det.fRawInput;
 221             fRawLength   = det.fRawLength;
 222         }
 223         fInputStream = det.fInputStream;
 224         fCharsetName = csName;
 225         fLang = lang;
 226     }
 227
 228
 229     //
 230     //   Private Data
 231     //
 232     private int                 fConfidence;
 233     private byte[]              fRawInput = null;     // Original, untouched input bytes.
 234                                                       //  If user gave us a byte array, this is it.
 235     private int                 fRawLength;           // Length of data in fRawInput array.
 236
 237     private InputStream         fInputStream = null;  // User's input stream, or null if the user
 238                                                       //   gave us a byte array.
 239
 240     private String              fCharsetName;         // The name of the charset this CharsetMatch
 241                                                       //   represents.  Filled in by the recognizer.
 242     private String              fLang;                // The language, if one was determined by
 243                                                       //   the recognizer during the detect operation.
 244 }