2 *******************************************************************************
\r
3 * Copyright (C) 2005-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.IOException;
\r
10 import java.io.InputStream;
\r
11 import java.io.Reader;
\r
12 import java.util.ArrayList;
\r
13 import java.util.Arrays;
\r
14 import java.util.Collections;
\r
18 * <code>CharsetDetector</code> provides a facility for detecting the
\r
19 * charset or encoding of character data in an unknown format.
\r
20 * The input data can either be from an input stream or an array of bytes.
\r
21 * The result of the detection operation is a list of possibly matching
\r
22 * charsets, or, for simple use, you can just ask for a Java Reader that
\r
23 * will work over the input data.
\r
25 * Character set detection is at best an imprecise operation. The detection
\r
26 * process will attempt to identify the charset that best matches the characteristics
\r
27 * of the byte data, but the process is partly statistical in nature, and
\r
28 * the results can not be guaranteed to always be correct.
\r
30 * For best accuracy in charset detection, the input data should be primarily
\r
31 * in a single language, and a minimum of a few hundred bytes worth of plain text
\r
32 * in the language are needed. The detection process will attempt to
\r
33 * ignore html or xml style markup that could otherwise obscure the content.
\r
37 public class CharsetDetector {
\r
39 // Question: Should we have getters corresponding to the setters for inut text
\r
40 // and declared encoding?
\r
42 // A thought: If we were to create our own type of Java Reader, we could defer
\r
43 // figuring out an actual charset for data that starts out with too much English
\r
44 // only ASCII until the user actually read through to something that didn't look
\r
45 // like 7 bit English. If nothing else ever appeared, we would never need to
\r
46 // actually choose the "real" charset. All assuming that the application just
\r
47 // wants the data, and doesn't care about a char set name.
\r
54 public CharsetDetector() {
\r
58 * Set the declared encoding for charset detection.
\r
59 * The declared encoding of an input text is an encoding obtained
\r
60 * from an http header or xml declaration or similar source that
\r
61 * can be provided as additional information to the charset detector.
\r
62 * A match between a declared encoding and a possible detected encoding
\r
63 * will raise the quality of that detected encoding by a small delta,
\r
64 * and will also appear as a "reason" for the match.
\r
66 * A declared encoding that is incompatible with the input data being
\r
67 * analyzed will not be added to the list of possible encodings.
\r
69 * @param encoding The declared encoding
\r
73 public CharsetDetector setDeclaredEncoding(String encoding) {
\r
74 fDeclaredEncoding = encoding;
\r
79 * Set the input text (byte) data whose charset is to be detected.
\r
81 * @param in the input text of unknown encoding
\r
83 * @return This CharsetDetector
\r
87 public CharsetDetector setText(byte [] in) {
\r
89 fRawLength = in.length;
\r
96 private static final int kBufSize = 8000;
\r
99 * Set the input text (byte) data whose charset is to be detected.
\r
101 * The input stream that supplies the character data must have markSupported()
\r
102 * == true; the charset detection process will read a small amount of data,
\r
103 * then return the stream to its original position via
\r
104 * the InputStream.reset() operation. The exact amount that will
\r
105 * be read depends on the characteristics of the data itself.
\r
107 * @param in the input text of unknown encoding
\r
109 * @return This CharsetDetector
\r
114 public CharsetDetector setText(InputStream in) throws IOException {
\r
116 fInputStream.mark(kBufSize);
\r
117 fRawInput = new byte[kBufSize]; // Always make a new buffer because the
\r
118 // previous one may have come from the caller,
\r
119 // in which case we can't touch it.
\r
121 int remainingLength = kBufSize;
\r
122 while (remainingLength > 0 ) {
\r
123 // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop.
\r
124 int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
\r
125 if (bytesRead <= 0) {
\r
128 fRawLength += bytesRead;
\r
129 remainingLength -= bytesRead;
\r
131 fInputStream.reset();
\r
133 MungeInput(); // Strip html markup, collect byte stats.
\r
139 * Return the charset that best matches the supplied input data.
\r
141 * Note though, that because the detection
\r
142 * only looks at the start of the input data,
\r
143 * there is a possibility that the returned charset will fail to handle
\r
144 * the full set of input data.
\r
146 * Raise an exception if
\r
148 * <li>no charset appears to match the data.</li>
\r
149 * <li>no input text has been provided</li>
\r
152 * @return a CharsetMatch object representing the best matching charset, or
\r
153 * <code>null</code> if there are no matches.
\r
157 public CharsetMatch detect() {
\r
158 // TODO: A better implementation would be to copy the detect loop from
\r
159 // detectAll(), and cut it short as soon as a match with a high confidence
\r
160 // is found. This is something to be done later, after things are otherwise
\r
162 CharsetMatch matches[] = detectAll();
\r
164 if (matches == null || matches.length == 0) {
\r
172 * Return an array of all charsets that appear to be plausible
\r
173 * matches with the input data. The array is ordered with the
\r
174 * best quality match first.
\r
176 * Raise an exception if
\r
178 * <li>no charsets appear to match the input data.</li>
\r
179 * <li>no input text has been provided</li>
\r
182 * @return An array of CharsetMatch objects representing possibly matching charsets.
\r
186 public CharsetMatch[] detectAll() {
\r
187 CharsetRecognizer csr;
\r
191 ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
\r
193 // Iterate over all possible charsets, remember all that
\r
194 // give a match quality > 0.
\r
195 for (i=0; i<fCSRecognizers.size(); i++) {
\r
196 csr = fCSRecognizers.get(i);
\r
197 detectResults = csr.match(this);
\r
198 confidence = detectResults & 0x000000ff;
\r
199 if (confidence > 0) {
\r
200 CharsetMatch m = new CharsetMatch(this, csr, confidence);
\r
204 Collections.sort(matches); // CharsetMatch compares on confidence
\r
205 Collections.reverse(matches); // Put best match first.
\r
206 CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
\r
207 resultArray = matches.toArray(resultArray);
\r
208 return resultArray;
\r
213 * Autodetect the charset of an inputStream, and return a Java Reader
\r
214 * to access the converted input data.
\r
216 * This is a convenience method that is equivalent to
\r
217 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
\r
219 * For the input stream that supplies the character data, markSupported()
\r
220 * must be true; the charset detection will read a small amount of data,
\r
221 * then return the stream to its original position via
\r
222 * the InputStream.reset() operation. The exact amount that will
\r
223 * be read depends on the characteristics of the data itself.
\r
225 * Raise an exception if no charsets appear to match the input data.
\r
227 * @param in The source of the byte data in the unknown charset.
\r
229 * @param declaredEncoding A declared encoding for the data, if available,
\r
230 * or null or an empty string if none is available.
\r
234 public Reader getReader(InputStream in, String declaredEncoding) {
\r
235 fDeclaredEncoding = declaredEncoding;
\r
240 CharsetMatch match = detect();
\r
242 if (match == null) {
\r
246 return match.getReader();
\r
247 } catch (IOException e) {
\r
253 * Autodetect the charset of an inputStream, and return a String
\r
254 * containing the converted input data.
\r
256 * This is a convenience method that is equivalent to
\r
257 * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
\r
259 * Raise an exception if no charsets appear to match the input data.
\r
261 * @param in The source of the byte data in the unknown charset.
\r
263 * @param declaredEncoding A declared encoding for the data, if available,
\r
264 * or null or an empty string if none is available.
\r
268 public String getString(byte[] in, String declaredEncoding)
\r
270 fDeclaredEncoding = declaredEncoding;
\r
275 CharsetMatch match = detect();
\r
277 if (match == null) {
\r
281 return match.getString(-1);
\r
282 } catch (IOException e) {
\r
289 * Get the names of all char sets that can be recognized by the char set detector.
\r
291 * @return an array of the names of all charsets that can be recognized
\r
292 * by the charset detector.
\r
296 public static String[] getAllDetectableCharsets() {
\r
297 return fCharsetNames;
\r
301 * Test whether or not input filtering is enabled.
\r
303 * @return <code>true</code> if input text will be filtered.
\r
305 * @see #enableInputFilter
\r
309 public boolean inputFilterEnabled()
\r
315 * Enable filtering of input text. If filtering is enabled,
\r
316 * text within angle brackets ("<" and ">") will be removed
\r
317 * before detection.
\r
319 * @param filter <code>true</code> to enable input text filtering.
\r
321 * @return The previous setting.
\r
325 public boolean enableInputFilter(boolean filter)
\r
327 boolean previous = fStripTags;
\r
329 fStripTags = filter;
\r
335 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
\r
336 * it by removing what appears to be html markup.
\r
338 private void MungeInput() {
\r
342 boolean inMarkup = false;
\r
347 // html / xml markup stripping.
\r
348 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
\r
349 // discard everything within < brackets >
\r
350 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
\r
351 // guess as to whether the input was actually marked up at all.
\r
353 for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
\r
354 b = fRawInput[srci];
\r
355 if (b == (byte)'<') {
\r
364 fInputBytes[dsti++] = b;
\r
367 if (b == (byte)'>') {
\r
376 // If it looks like this input wasn't marked up, or if it looks like it's
\r
377 // essentially nothing but markup abandon the markup stripping.
\r
378 // Detection will have to work on the unstripped input.
\r
380 if (openTags<5 || openTags/5 < badTags ||
\r
381 (fInputLen < 100 && fRawLength>600)) {
\r
382 int limit = fRawLength;
\r
384 if (limit > kBufSize) {
\r
388 for (srci=0; srci<limit; srci++) {
\r
389 fInputBytes[srci] = fRawInput[srci];
\r
395 // Tally up the byte occurence statistics.
\r
396 // These are available for use by the various detectors.
\r
398 Arrays.fill(fByteStats, (short)0);
\r
399 for (srci=0; srci<fInputLen; srci++) {
\r
400 int val = fInputBytes[srci] & 0x00ff;
\r
405 for (int i = 0x80; i <= 0x9F; i += 1) {
\r
406 if (fByteStats[i] != 0) {
\r
414 * The following items are accessed by individual CharsetRecongizers during
\r
415 * the recognition process
\r
418 byte[] fInputBytes = // The text to be checked. Markup will have been
\r
419 new byte[kBufSize]; // removed if appropriate.
\r
421 int fInputLen; // Length of the byte data in fInputText.
\r
423 short fByteStats[] = // byte frequency statistics for the input text.
\r
424 new short[256]; // Value is percent, not absolute.
\r
425 // Value is rounded up, so zero really means zero occurences.
\r
427 boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
\r
430 String fDeclaredEncoding;
\r
435 // Stuff private to CharsetDetector
\r
437 byte[] fRawInput; // Original, untouched input bytes.
\r
438 // If user gave us a byte array, this is it.
\r
439 // If user gave us a stream, it's read to a
\r
441 int fRawLength; // Length of data in fRawInput array.
\r
443 InputStream fInputStream; // User's input stream, or null if the user
\r
444 // gave us a byte array.
\r
446 boolean fStripTags = // If true, setText() will strip tags from input text.
\r
451 * List of recognizers for all charsets known to the implementation.
\r
453 private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
\r
454 private static String [] fCharsetNames;
\r
457 * Create the singleton instances of the CharsetRecognizer classes
\r
459 private static ArrayList<CharsetRecognizer> createRecognizers() {
\r
460 ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
\r
462 recognizers.add(new CharsetRecog_UTF8());
\r
464 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
\r
465 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
\r
466 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
\r
467 recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
\r
469 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
\r
470 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
\r
471 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
\r
472 recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
\r
473 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
\r
474 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
\r
475 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
\r
476 recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
\r
478 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
\r
479 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
\r
480 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
\r
481 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
\r
482 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
\r
483 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
\r
484 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
\r
485 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
\r
486 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
\r
487 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
\r
488 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
\r
489 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
\r
490 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
\r
491 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
\r
492 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
\r
493 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
\r
494 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
\r
495 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
\r
496 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
\r
497 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
\r
498 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
\r
499 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
\r
500 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
\r
502 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
\r
503 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
\r
504 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
\r
505 recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
\r
507 // Create an array of all charset names, as a side effect.
\r
508 // Needed for the getAllDetectableCharsets() API.
\r
509 String[] charsetNames = new String [recognizers.size()];
\r
512 for (int i = 0; i < recognizers.size(); i++) {
\r
513 String name = recognizers.get(i).getName();
\r
515 if (out == 0 || ! name.equals(charsetNames[out - 1])) {
\r
516 charsetNames[out++] = name;
\r
520 fCharsetNames = new String[out];
\r
521 System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
\r
523 return recognizers;
\r