2 *******************************************************************************
3 * Copyright (C) 2005-2012, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.io.ByteArrayInputStream;
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.io.InputStreamReader;
13 import java.io.Reader;
17 * This class represents a charset that has been identified by a CharsetDetector
18 * as a possible encoding for a set of input data. From an instance of this
19 * class, you can ask for a confidence level in the charset identification,
20 * or for Java Reader or String to access the original byte data in Unicode form.
22 * Instances of this class are created only by CharsetDetectors.
24 * Note: this class has a natural ordering that is inconsistent with equals.
25 * The natural ordering is based on the match confidence value.
29 public class CharsetMatch implements Comparable<CharsetMatch> {
33 * Create a java.io.Reader for reading the Unicode character data corresponding
34 * to the original byte data supplied to the Charset detect operation.
36 * CAUTION: if the source of the byte data was an InputStream, a Reader
37 * can be created for only one matching char set using this method. If more
38 * than one charset needs to be tried, the caller will need to reset
39 * the InputStream and create InputStreamReaders itself, based on the charset name.
41 * @return the Reader for the Unicode character data.
45 public Reader getReader() {
46 InputStream inputStream = fInputStream;
48 if (inputStream == null) {
49 inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
54 return new InputStreamReader(inputStream, getName());
55 } catch (IOException e) {
61 * Create a Java String from Unicode character data corresponding
62 * to the original byte data supplied to the Charset detect operation.
64 * @return a String created from the converted input data.
68 public String getString() throws java.io.IOException {
74 * Create a Java String from Unicode character data corresponding
75 * to the original byte data supplied to the Charset detect operation.
76 * The length of the returned string is limited to the specified size;
77 * the string will be trunctated to this length if necessary. A limit value of
78 * zero or less is ignored, and treated as no limit.
80 * @param maxLength The maximium length of the String to be created when the
81 * source of the data is an input stream, or -1 for
83 * @return a String created from the converted input data.
87 public String getString(int maxLength) throws java.io.IOException {
89 if (fInputStream != null) {
90 StringBuilder sb = new StringBuilder();
91 char[] buffer = new char[1024];
92 Reader reader = getReader();
93 int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
96 while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
97 sb.append(buffer, 0, bytesRead);
103 return sb.toString();
105 String name = getName();
107 * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot
108 * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr'
109 * should be stripped off before creating the string.
111 int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl");
112 if (startSuffix > 0) {
113 name = name.substring(0, startSuffix);
115 result = new String(fRawInput, name);
122 * Get an indication of the confidence in the charset detected.
123 * Confidence values range from 0-100, with larger numbers indicating
124 * a better match of the input data to the characteristics of the
127 * @return the confidence in the charset match
131 public int getConfidence() {
136 * Get the name of the detected charset.
137 * The name will be one that can be used with other APIs on the
138 * platform that accept charset names. It is the "Canonical name"
139 * as defined by the class java.nio.charset.Charset; for
140 * charsets that are registered with the IANA charset registry,
141 * this is the MIME-preferred registered name.
143 * @see java.nio.charset.Charset
144 * @see java.io.InputStreamReader
146 * @return The name of the charset.
150 public String getName() {
155 * Get the ISO code for the language of the detected charset.
157 * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
161 public String getLanguage() {
166 * Compare to other CharsetMatch objects.
167 * Comparison is based on the match confidence value, which
168 * allows CharsetDetector.detectAll() to order its results.
170 * @param other the CharsetMatch object to compare against.
171 * @return a negative integer, zero, or a positive integer as the
172 * confidence level of this CharsetMatch
173 * is less than, equal to, or greater than that of
175 * @throws ClassCastException if the argument is not a CharsetMatch.
178 public int compareTo (CharsetMatch other) {
179 int compareResult = 0;
180 if (this.fConfidence > other.fConfidence) {
182 } else if (this.fConfidence < other.fConfidence) {
185 return compareResult;
189 * Constructor. Implementation internal
// Builds a match whose charset name and language are taken from the recognizer.
191 CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
194 // The references to the original application input data must be copied out
195 // of the charset detector to here, in case the application resets the
196 // detector before using this CharsetMatch.
197 if (det.fInputStream == null) {
198 // We only want the existing input byte data if it came straight from the user,
199 // not if it is just the head of a stream.
200 fRawInput = det.fRawInput;
201 fRawLength = det.fRawLength;
// Keep the stream reference too; it is null when the input was a byte array.
203 fInputStream = det.fInputStream;
204 fCharsetName = rec.getName();
205 fLang = rec.getLanguage();
209 * Constructor. Implementation internal
// Builds a match that uses the caller-supplied charset name (csName) instead of
// asking the recognizer for it.
211 CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) {
214 // The references to the original application input data must be copied out
215 // of the charset detector to here, in case the application resets the
216 // detector before using this CharsetMatch.
217 if (det.fInputStream == null) {
218 // We only want the existing input byte data if it came straight from the user,
219 // not if it is just the head of a stream.
220 fRawInput = det.fRawInput;
221 fRawLength = det.fRawLength;
// Keep the stream reference too; it is null when the input was a byte array.
223 fInputStream = det.fInputStream;
224 fCharsetName = csName;
232 private int fConfidence; // Match confidence; compareTo() orders matches by this value.
233 private byte[] fRawInput = null; // Original, untouched input bytes.
234 // If the user gave us a byte array, this is it;
// it is only copied from the detector when there is no input stream.
235 private int fRawLength; // Length of the valid data in the fRawInput array.
237 private InputStream fInputStream = null; // User's input stream, or null if the user
238 // gave us a byte array.
240 private String fCharsetName; // The name of the charset this CharsetMatch
241 // represents. Taken from the recognizer, or supplied explicitly by the caller.
242 private String fLang; // The language, if one was determined by
243 // the recognizer during the detect operation.