2 *******************************************************************************
\r
3 * Copyright (C) 2005-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.ByteArrayInputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.io.InputStreamReader;
\r
13 import java.io.Reader;
\r
17 * This class represents a charset that has been identified by a CharsetDetector
\r
18 * as a possible encoding for a set of input data. From an instance of this
\r
19 * class, you can ask for a confidence level in the charset identification,
\r
20 * or for Java Reader or String to access the original byte data in Unicode form.
\r
22 * Instances of this class are created only by CharsetDetectors.
\r
24 * Note: this class has a natural ordering that is inconsistent with equals.
\r
25 * The natural ordering is based on the match confidence value.
\r
29 public class CharsetMatch implements Comparable<CharsetMatch> {
\r
33 * Create a java.io.Reader for reading the Unicode character data corresponding
\r
34 * to the original byte data supplied to the Charset detect operation.
\r
36 * CAUTION: if the source of the byte data was an InputStream, a Reader
\r
37 * can be created for only one matching char set using this method. If more
\r
38 * than one charset needs to be tried, the caller will need to reset
\r
39 * the InputStream and create InputStreamReaders itself, based on the charset name.
\r
41 * @return the Reader for the Unicode character data.
\r
45 public Reader getReader() {
\r
46 InputStream inputStream = fInputStream;
\r
48 if (inputStream == null) {
\r
49 inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
\r
53 inputStream.reset();
\r
54 return new InputStreamReader(inputStream, getName());
\r
55 } catch (IOException e) {
\r
61 * Create a Java String from Unicode character data corresponding
\r
62 * to the original byte data supplied to the Charset detect operation.
\r
64 * @return a String created from the converted input data.
\r
68 public String getString() throws java.io.IOException {
\r
69 return getString(-1);
\r
74 * Create a Java String from Unicode character data corresponding
\r
75 * to the original byte data supplied to the Charset detect operation.
\r
76 * The length of the returned string is limited to the specified size;
\r
77 * the string will be truncated to this length if necessary. A limit value of
\r
78 * zero or less is ignored, and treated as no limit.
\r
80 * @param maxLength The maximum length of the String to be created when the
\r
81 * source of the data is an input stream, or -1 for
\r
83 * @return a String created from the converted input data.
\r
87 public String getString(int maxLength) throws java.io.IOException {
\r
88 String result = null;
\r
89 if (fInputStream != null) {
\r
90 StringBuilder sb = new StringBuilder();
\r
91 char[] buffer = new char[1024];
\r
92 Reader reader = getReader();
\r
93 int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
\r
96 while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
\r
97 sb.append(buffer, 0, bytesRead);
\r
103 return sb.toString();
\r
105 result = new String(fRawInput, getName());
\r
112 * Get an indication of the confidence in the charset detected.
\r
113 * Confidence values range from 0-100, with larger numbers indicating
\r
114 * a better match of the input data to the characteristics of the
\r
117 * @return the confidence in the charset match
\r
121 public int getConfidence() {
\r
122 return fConfidence;
\r
127 * Bit flag indicating the match is based on the encoding scheme.
\r
129 * @see #getMatchType
\r
132 static public final int ENCODING_SCHEME = 1;
\r
135 * Bit flag indicating the match is based on the presence of a BOM.
\r
137 * @see #getMatchType
\r
140 static public final int BOM = 2;
\r
143 * Bit flag indicating the match is based on the declared encoding.
\r
145 * @see #getMatchType
\r
148 static public final int DECLARED_ENCODING = 4;
\r
151 * Bit flag indicating the match is based on language statistics.
\r
153 * @see #getMatchType
\r
156 static public final int LANG_STATISTICS = 8;
\r
159 * Return flags indicating what it was about the input data
\r
160 * that caused this charset to be considered as a possible match.
\r
161 * The result is a bitfield containing zero or more of the flags
\r
162 * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
\r
163 * A result of zero means no information is available.
\r
165 * Note: currently, this method always returns zero.
\r
168 * @return the type of match found for this charset.
\r
171 * @provisional This API might change or be removed in a future release.
\r
173 public int getMatchType() {
\r
174 // TODO: create a list of enum-like constants for common combinations of types of matches.
\r
179 * Get the name of the detected charset.
\r
180 * The name will be one that can be used with other APIs on the
\r
181 * platform that accept charset names. It is the "Canonical name"
\r
182 * as defined by the class java.nio.charset.Charset; for
\r
183 * charsets that are registered with the IANA charset registry,
\r
184 * this is the MIME-preferred registered name.
\r
186 * @see java.nio.charset.Charset
\r
187 * @see java.io.InputStreamReader
\r
189 * @return The name of the charset.
\r
193 public String getName() {
\r
194 return fRecognizer.getName();
\r
198 * Get the ISO code for the language of the detected charset.
\r
200 * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
\r
204 public String getLanguage() {
\r
205 return fRecognizer.getLanguage();
\r
209 * Compare to other CharsetMatch objects.
\r
210 * Comparison is based on the match confidence value, which
\r
211 * allows CharsetDetector.detectAll() to order its results.
\r
213 * @param other the CharsetMatch object to compare against.
\r
214 * @return a negative integer, zero, or a positive integer as the
\r
215 * confidence level of this CharsetMatch
\r
216 * is less than, equal to, or greater than that of
\r
218 * @throws ClassCastException if the argument is not a CharsetMatch.
\r
221 public int compareTo (CharsetMatch other) {
\r
222 int compareResult = 0;
\r
223 if (this.fConfidence > other.fConfidence) {
\r
225 } else if (this.fConfidence < other.fConfidence) {
\r
226 compareResult = -1;
\r
228 return compareResult;
\r
232 * Constructor. Implementation internal
\r
234 CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
\r
236 fConfidence = conf;
\r
238 // The references to the original application input data must be copied out
\r
239 // of the charset recognizer to here, in case the application resets the
\r
240 // recognizer before using this CharsetMatch.
\r
241 if (det.fInputStream == null) {
\r
242 // We only want the existing input byte data if it came straight from the user,
\r
243 // not if it is just the head of a stream.
\r
244 fRawInput = det.fRawInput;
\r
245 fRawLength = det.fRawLength;
\r
247 fInputStream = det.fInputStream;
\r
254 private int fConfidence;
\r
255 private CharsetRecognizer fRecognizer;
\r
256 private byte[] fRawInput = null; // Original, untouched input bytes.
\r
257 // If user gave us a byte array, this is it.
\r
258 private int fRawLength; // Length of data in fRawInput array.
\r
260 private InputStream fInputStream = null; // User's input stream, or null if the user
\r
261 // gave us a byte array.
\r