2 ****************************************************************************
3 * Copyright (C) 2005-2012, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ****************************************************************************
8 package com.ibm.icu.text;
10 import java.util.Arrays;
13 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
14 * Match is determined mostly by the input data adhering to the
15 * encoding scheme for the charset, and, optionally,
16 * frequency-of-occurrence of characters.
18 * Instances of this class are singletons, one per encoding
19 * being recognized. They are created in the main
20 * CharsetDetector class and kept in the global list of available
21 * encodings to be checked. The specific encoding being recognized
22 * is determined by subclass.
24 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
27 * Get the IANA name of this charset.
28 * @return the charset name.
30 abstract String getName() ;
34 * Test the match of this charset with the input text data
35 * which is obtained via the CharsetDetector object.
37 * @param det The CharsetDetector, which contains the input text
38 * to be checked for being in this charset.
39 * @return Two values packed into one int (Java has no multiple return values)
41 * bits 0-7: the match confidence, ranging from 0-100
43 * bits 8-15: The match reason, an enum-like value.
45 int match(CharsetDetector det, int [] commonChars) {
// Walks the input one character at a time through the subclass's nextChar(),
// tallies single-byte, double-byte and "common" characters, and derives a
// confidence value from those tallies.
// commonChars may be null (no frequency statistics); when non-null it is probed
// with Arrays.binarySearch and therefore must be sorted in ascending order.
46 @SuppressWarnings("unused")
47 int singleByteCharCount = 0; //TODO Do we really need this?
48 int doubleByteCharCount = 0;
49 int commonCharCount = 0;
51 int totalCharCount = 0;
53 iteratedChar iter = new iteratedChar();
56 for (iter.reset(); nextChar(iter, det);) {
// Mask to 32 bits so the raw character value is held as a non-negative long.
61 long cv = iter.charValue & 0xFFFFFFFFL;
64 singleByteCharCount++;
66 doubleByteCharCount++;
67 if (commonChars != null) {
68 // NOTE: This assumes that there are no 4-byte common chars.
69 if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
// Two or more bad characters that also make up at least a fifth of the
// double-byte count trigger the early exit below.
75 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
76 // Bail out early if the byte data is not matching the encoding scheme.
81 if (doubleByteCharCount <= 10 && badCharCount== 0) {
82 // Not many multi-byte chars.
83 if (doubleByteCharCount == 0 && totalCharCount < 10) {
84 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
85 // We don't have enough data to have any confidence.
86 // Statistical analysis of single byte non-ASCII characters would probably help here.
90 // ASCII or ISO file? It's probably not our encoding,
91 // but is not incompatible with our encoding, so don't give it a zero.
99 // No match if there are too many characters that don't fit the encoding scheme.
100 // (should we have zero tolerance for these?)
102 if (doubleByteCharCount < 20*badCharCount) {
107 if (commonChars == null) {
108 // We have no statistics on frequently occurring characters.
109 // Assess confidence purely on having a reasonable number of
110 // multi-byte characters (the more the better).
111 confidence = 30 + doubleByteCharCount - 20*badCharCount;
112 if (confidence > 100) {
117 // Frequency of occurrence statistics exist.
// Confidence = 10 + 90 * log(commonCharCount + 1) / log(doubleByteCharCount / 4),
// truncated to int and then capped at 100.
119 double maxVal = Math.log((float)doubleByteCharCount / 4);
120 double scaleFactor = 90.0 / maxVal;
121 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
122 confidence = Math.min(confidence, 100);
124 } // end of detectBlock:
129 // "Character" iterated character class.
130 // Recognizers for specific mbcs encodings make their "characters" available
131 // by providing a nextChar() function that fills in an instance of iteratedChar
132 // with the next char from the input.
133 // The returned characters are not converted to Unicode, but remain as the raw
134 // bytes (concatenated into an int) from the codepage data.
136 // For Asian charsets, use the raw input rather than the input that has been
137 // stripped of markup. Detection only considers multi-byte chars, effectively
138 // stripping markup anyway, and double byte chars do occur in markup too.
140 static class iteratedChar {
// Mutable holder for the character currently being iterated; filled in by
// the nextChar() implementations of the individual recognizers.
141 int charValue = 0; // 1-4 bytes from the raw input data
144 boolean error = false;
// nextChar() implementations report "character available" as (done == false).
145 boolean done = false;
// Fetches the next raw input byte from the detector and advances nextIndex.
155 int nextByte(CharsetDetector det) {
// No bytes left once nextIndex reaches det.fRawLength. Callers test the result
// for < 0, so presumably a negative value is returned here - TODO confirm.
156 if (nextIndex >= det.fRawLength) {
// Mask with 0xff so the byte is delivered as a value in 0..255.
160 int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
166 * Get the next character (however many bytes it is) from the input data.
167 * Subclasses for specific charset encodings must implement this function
168 * to get characters according to the rules of their encoding scheme.
170 * This function is not a method of class iteratedChar only because
171 * that would require a lot of extra derived classes, which is awkward.
172 * @param it The iteratedChar "struct" into which the returned char is placed.
173 * @param det The charset detector, which is needed to get at the input byte data
174 * being iterated over.
175 * @return True if a character was returned, false at end of input.
177 abstract boolean nextChar(iteratedChar it, CharsetDetector det);
184 * Shift-JIS charset recognizer.
187 static class CharsetRecog_sjis extends CharsetRecog_mbcs {
188 static int [] commonChars =
189 // TODO: This set of data comes from the character frequency-
190 // of-occurrence analysis tool. The data needs to be moved
191 // into a resource and loaded from there.
192 {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
193 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
194 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
195 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
196 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
197 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
199 boolean nextChar(iteratedChar it, CharsetDetector det) {
// Reads the next Shift-JIS character from the detector's raw input into 'it';
// it.charValue receives the raw one or two bytes.
// Record where this character starts before consuming any bytes.
200 it.index = it.nextIndex;
203 firstByte = it.charValue = it.nextByte(det);
// Lead values 0x00-0x7f and 0xa1-0xdf take this branch before any second byte
// is read (presumably the single-byte case).
208 if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
212 int secondByte = it.nextByte(det);
// A negative value from nextByte() means no second byte was available.
213 if (secondByte < 0) {
// Pack lead and trail byte into one value, lead byte in the higher position.
216 it.charValue = (firstByte << 8) | secondByte;
// NOTE(review): the two ranges are adjacent, so every value 0x40-0xff passes,
// including 0x7f and 0xfd-0xff. Shift-JIS trail bytes are usually given as
// 0x40-0x7e and 0x80-0xfc - confirm whether the looser test is intentional.
217 if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
218 // Illegal second byte value.
224 CharsetMatch match(CharsetDetector det) {
// Score the input with the shared matcher and this class's commonChars table.
// A confidence of 0 is reported as null; anything else is wrapped in a CharsetMatch.
225 int confidence = match(det, commonChars);
226 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
233 public String getLanguage()
243 * Big5 charset recognizer.
246 static class CharsetRecog_big5 extends CharsetRecog_mbcs {
247 static int [] commonChars =
248 // TODO: This set of data comes from the character frequency-
249 // of-occurrence analysis tool. The data needs to be moved
250 // into a resource and loaded from there.
251 {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
252 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
253 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
254 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
255 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
256 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
257 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
258 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
259 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
260 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
262 boolean nextChar(iteratedChar it, CharsetDetector det) {
// Reads the next Big5 character (one or two raw bytes) from the detector's
// input into it.charValue.
// Record where this character starts before consuming any bytes.
263 it.index = it.nextIndex;
266 firstByte = it.charValue = it.nextByte(det);
271 if (firstByte <= 0x7f || firstByte==0xff) {
272 // single byte character.
276 int secondByte = it.nextByte(det);
// A negative value from nextByte() means no second byte was available.
277 if (secondByte < 0) {
// charValue still holds the lead byte; shift it up and append the trail byte.
280 it.charValue = (it.charValue << 8) | secondByte;
// Trail-byte values singled out here include everything below 0x40 and 0xff
// (presumably the illegal values - TODO confirm what the branch does).
282 if (secondByte < 0x40 ||
284 secondByte == 0xff) {
290 CharsetMatch match(CharsetDetector det) {
// Score the input with the shared matcher and this class's commonChars table.
// A confidence of 0 is reported as null; anything else is wrapped in a CharsetMatch.
291 int confidence = match(det, commonChars);
292 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
300 public String getLanguage()
308 * EUC charset recognizers. One abstract class that provides the common function
309 * for getting the next character according to the EUC encoding scheme,
310 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
313 abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
317 * Get the next character value for EUC based encodings.
318 * Character "value" is simply the raw bytes that make up the character
319 * packed into an int.
321 boolean nextChar(iteratedChar it, CharsetDetector det) {
// Reads the next EUC character from the detector's raw input into 'it'.
// The lead byte selects the form: 0xa1-0xfe and 0x8e are followed by one more
// byte, 0x8f by two more; the raw bytes are accumulated in it.charValue.
// Record where this character starts before consuming any bytes.
322 it.index = it.nextIndex;
327 //int fourthByte = 0;
330 firstByte = it.charValue = it.nextByte(det);
332 // Ran off the end of the input data
// Values up to 0x8d take this branch before a second byte is read
// (presumably the single-byte case).
336 if (firstByte <= 0x8d) {
341 secondByte = it.nextByte(det);
// Append the second byte below the lead byte.
342 it.charValue = (it.charValue << 8) | secondByte;
344 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
// A trail byte below 0xa1 is singled out for a two-byte character
// (presumably marked as an error - TODO confirm).
346 if (secondByte < 0xa1) {
351 if (firstByte == 0x8e) {
353 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
354 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
355 // We don't know which we've got.
356 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
357 // bytes will look like a well formed 2 byte char.
358 if (secondByte < 0xa1) {
364 if (firstByte == 0x8f) {
366 // Three byte total char size, two bytes of actual char value.
367 thirdByte = it.nextByte(det);
368 it.charValue = (it.charValue << 8) | thirdByte;
369 if (thirdByte < 0xa1) {
// True while a character was produced; false once 'it' has been marked done.
375 return (it.done == false);
379 * The charset recognizer for EUC-JP. A singleton instance of this class
380 * is created and kept by the public CharsetDetector class.
382 static class CharsetRecog_euc_jp extends CharsetRecog_euc {
383 static int [] commonChars =
384 // TODO: This set of data comes from the character frequency-
385 // of-occurrence analysis tool. The data needs to be moved
386 // into a resource and loaded from there.
387 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
388 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
389 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
390 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
391 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
392 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
393 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
394 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
395 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
396 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
401 CharsetMatch match(CharsetDetector det) {
// Score the input with the shared matcher and this class's commonChars table.
// A confidence of 0 is reported as null; anything else is wrapped in a CharsetMatch.
402 int confidence = match(det, commonChars);
403 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
406 public String getLanguage()
413 * The charset recognizer for EUC-KR. A singleton instance of this class
414 * is created and kept by the public CharsetDetector class.
416 static class CharsetRecog_euc_kr extends CharsetRecog_euc {
417 static int [] commonChars =
418 // TODO: This set of data comes from the character frequency-
419 // of-occurrence analysis tool. The data needs to be moved
420 // into a resource and loaded from there.
421 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
422 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
423 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
424 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
425 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
426 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
427 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
428 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
429 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
430 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
436 CharsetMatch match(CharsetDetector det) {
// Score the input with the shared matcher and this class's commonChars table.
// A confidence of 0 is reported as null; anything else is wrapped in a CharsetMatch.
437 int confidence = match(det, commonChars);
438 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
441 public String getLanguage()
450 * GB-18030 recognizer. Uses simplified Chinese statistics.
453 static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
457 * Get the next character value for the GB-18030 encoding.
458 * Character "value" is simply the raw bytes that make up the character
459 * packed into an int.
461 boolean nextChar(iteratedChar it, CharsetDetector det) {
// Reads the next GB-18030 character from the detector's raw input into 'it'.
// Two-byte and four-byte forms are recognized from the byte ranges tested
// below; the raw bytes are accumulated in it.charValue.
// Record where this character starts before consuming any bytes.
462 it.index = it.nextIndex;
470 firstByte = it.charValue = it.nextByte(det);
473 // Ran off the end of the input data
// Values up to 0x80 take this branch before a second byte is read
// (presumably the single-byte case).
478 if (firstByte <= 0x80) {
483 secondByte = it.nextByte(det);
// Append the second byte below the lead byte.
484 it.charValue = (it.charValue << 8) | secondByte;
486 if (firstByte >= 0x81 && firstByte <= 0xFE) {
// Two-byte form.
// NOTE(review): the lower bound "80" is a decimal literal (= 0x50) while every
// other bound here is hexadecimal. As written the second range is 0x50-0xFE,
// which overlaps the first and also admits 0x7F; 0x80 was probably intended.
// Confirm against the GB 18030 trail-byte ranges before changing.
488 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
// Four-byte form: second byte 0x30-0x39, third byte 0x81-0xFE, fourth byte 0x30-0x39.
493 if (secondByte >= 0x30 && secondByte <= 0x39) {
494 thirdByte = it.nextByte(det);
496 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
497 fourthByte = it.nextByte(det);
499 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
// charValue already holds bytes one and two; move them up 16 bits and
// append bytes three and four.
500 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
// True while a character was produced; false once 'it' has been marked done.
511 return (it.done == false);
514 static int [] commonChars =
515 // TODO: This set of data comes from the character frequency-
516 // of-occurrence analysis tool. The data needs to be moved
517 // into a resource and loaded from there.
518 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
519 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
520 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
521 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
522 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
523 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
524 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
525 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
526 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
527 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
534 CharsetMatch match(CharsetDetector det) {
// Score the input with the shared matcher and this class's commonChars table.
// A confidence of 0 is reported as null; anything else is wrapped in a CharsetMatch.
535 int confidence = match(det, commonChars);
536 return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
539 public String getLanguage()