/**
 ****************************************************************************
 * Copyright (C) 2005-2010, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */
package com.ibm.icu.text;

import java.util.Arrays;
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 * <p>
 * Instances of this class are singletons, one per encoding
 * being recognized. They are created in the main
 * CharsetDetector class and kept in the global list of available
 * encodings to be checked. The specific encoding being recognized
 * is determined by subclass.
 */
24 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
\r
27 * Get the IANA name of this charset.
\r
28 * @return the charset name.
\r
30 abstract String getName() ;
\r
34 * Test the match of this charset with the input text data
\r
35 * which is obtained via the CharsetDetector object.
\r
37 * @param det The CharsetDetector, which contains the input text
\r
38 * to be checked for being in this charset.
\r
39 * @return Two values packed into one int (Damn java, anyhow)
\r
41 * bits 0-7: the match confidence, ranging from 0-100
\r
43 * bits 8-15: The match reason, an enum-like value.
\r
45 int match(CharsetDetector det, int [] commonChars) {
\r
46 int singleByteCharCount = 0;
\r
47 int doubleByteCharCount = 0;
\r
48 int commonCharCount = 0;
\r
49 int badCharCount = 0;
\r
50 int totalCharCount = 0;
\r
52 iteratedChar iter = new iteratedChar();
\r
55 for (iter.reset(); nextChar(iter, det);) {
\r
60 long cv = iter.charValue & 0xFFFFFFFFL;
\r
63 singleByteCharCount++;
\r
65 doubleByteCharCount++;
\r
66 if (commonChars != null) {
\r
67 // NOTE: This assumes that there are no 4-byte common chars.
\r
68 if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
\r
74 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
\r
75 // Bail out early if the byte data is not matching the encoding scheme.
\r
80 if (doubleByteCharCount <= 10 && badCharCount== 0) {
\r
81 // Not many multi-byte chars.
\r
82 if (doubleByteCharCount == 0 && totalCharCount < 10) {
\r
83 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
\r
84 // We don't have enough data to have any confidence.
\r
85 // Statistical analysis of single byte non-ASCII charcters would probably help here.
\r
89 // ASCII or ISO file? It's probably not our encoding,
\r
90 // but is not incompatible with our encoding, so don't give it a zero.
\r
98 // No match if there are too many characters that don't fit the encoding scheme.
\r
99 // (should we have zero tolerance for these?)
\r
101 if (doubleByteCharCount < 20*badCharCount) {
\r
106 if (commonChars == null) {
\r
107 // We have no statistics on frequently occuring characters.
\r
108 // Assess confidence purely on having a reasonable number of
\r
109 // multi-byte characters (the more the better
\r
110 confidence = 30 + doubleByteCharCount - 20*badCharCount;
\r
111 if (confidence > 100) {
\r
116 // Frequency of occurence statistics exist.
\r
118 double maxVal = Math.log((float)doubleByteCharCount / 4);
\r
119 double scaleFactor = 90.0 / maxVal;
\r
120 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
\r
121 confidence = Math.min(confidence, 100);
\r
123 } // end of detectBlock:
\r
128 // "Character" iterated character class.
\r
129 // Recognizers for specific mbcs encodings make their "characters" available
\r
130 // by providing a nextChar() function that fills in an instance of iteratedChar
\r
131 // with the next char from the input.
\r
132 // The returned characters are not converted to Unicode, but remain as the raw
\r
133 // bytes (concatenated into an int) from the codepage data.
\r
135 // For Asian charsets, use the raw input rather than the input that has been
\r
136 // stripped of markup. Detection only considers multi-byte chars, effectively
\r
137 // stripping markup anyway, and double byte chars do occur in markup too.
\r
139 static class iteratedChar {
\r
140 int charValue = 0; // 1-4 bytes from the raw input data
\r
143 boolean error = false;
\r
144 boolean done = false;
\r
154 int nextByte(CharsetDetector det) {
\r
155 if (nextIndex >= det.fRawLength) {
\r
159 int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
\r
165 * Get the next character (however many bytes it is) from the input data
\r
166 * Subclasses for specific charset encodings must implement this function
\r
167 * to get characters according to the rules of their encoding scheme.
\r
169 * This function is not a method of class iteratedChar only because
\r
170 * that would require a lot of extra derived classes, which is awkward.
\r
171 * @param it The iteratedChar "struct" into which the returned char is placed.
\r
172 * @param det The charset detector, which is needed to get at the input byte data
\r
173 * being iterated over.
\r
174 * @return True if a character was returned, false at end of input.
\r
176 abstract boolean nextChar(iteratedChar it, CharsetDetector det);
\r
183 * Shift-JIS charset recognizer.
\r
186 static class CharsetRecog_sjis extends CharsetRecog_mbcs {
\r
187 static int [] commonChars =
\r
188 // TODO: This set of data comes from the character frequency-
\r
189 // of-occurence analysis tool. The data needs to be moved
\r
190 // into a resource and loaded from there.
\r
191 {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
\r
192 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
\r
193 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
\r
194 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
\r
195 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
\r
196 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
\r
198 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
199 it.index = it.nextIndex;
\r
202 firstByte = it.charValue = it.nextByte(det);
\r
203 if (firstByte < 0) {
\r
207 if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
\r
211 int secondByte = it.nextByte(det);
\r
212 if (secondByte < 0) {
\r
215 it.charValue = (firstByte << 8) | secondByte;
\r
216 if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
\r
217 // Illegal second byte value.
\r
223 int match(CharsetDetector det) {
\r
224 return match(det, commonChars);
\r
228 return "Shift_JIS";
\r
231 public String getLanguage()
\r
241 * Big5 charset recognizer.
\r
244 static class CharsetRecog_big5 extends CharsetRecog_mbcs {
\r
245 static int [] commonChars =
\r
246 // TODO: This set of data comes from the character frequency-
\r
247 // of-occurence analysis tool. The data needs to be moved
\r
248 // into a resource and loaded from there.
\r
249 {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
\r
250 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
\r
251 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
\r
252 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
\r
253 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
\r
254 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
\r
255 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
\r
256 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
\r
257 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
\r
258 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
\r
260 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
261 it.index = it.nextIndex;
\r
264 firstByte = it.charValue = it.nextByte(det);
\r
265 if (firstByte < 0) {
\r
269 if (firstByte <= 0x7f || firstByte==0xff) {
\r
270 // single byte character.
\r
274 int secondByte = it.nextByte(det);
\r
275 if (secondByte < 0) {
\r
278 it.charValue = (it.charValue << 8) | secondByte;
\r
280 if (secondByte < 0x40 ||
\r
281 secondByte ==0x7f ||
\r
282 secondByte == 0xff) {
\r
288 int match(CharsetDetector det) {
\r
289 return match(det, commonChars);
\r
297 public String getLanguage()
\r
305 * EUC charset recognizers. One abstract class that provides the common function
\r
306 * for getting the next character according to the EUC encoding scheme,
\r
307 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
\r
310 abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
\r
314 * Get the next character value for EUC based encodings.
\r
315 * Character "value" is simply the raw bytes that make up the character
\r
316 * packed into an int.
\r
318 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
319 it.index = it.nextIndex;
\r
322 int secondByte = 0;
\r
324 //int fourthByte = 0;
\r
327 firstByte = it.charValue = it.nextByte(det);
\r
328 if (firstByte < 0) {
\r
329 // Ran off the end of the input data
\r
333 if (firstByte <= 0x8d) {
\r
334 // single byte char
\r
338 secondByte = it.nextByte(det);
\r
339 it.charValue = (it.charValue << 8) | secondByte;
\r
341 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
\r
343 if (secondByte < 0xa1) {
\r
348 if (firstByte == 0x8e) {
\r
350 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
\r
351 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
\r
352 // We don't know which we've got.
\r
353 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
\r
354 // bytes will look like a well formed 2 byte char.
\r
355 if (secondByte < 0xa1) {
\r
361 if (firstByte == 0x8f) {
\r
363 // Three byte total char size, two bytes of actual char value.
\r
364 thirdByte = it.nextByte(det);
\r
365 it.charValue = (it.charValue << 8) | thirdByte;
\r
366 if (thirdByte < 0xa1) {
\r
372 return (it.done == false);
\r
376 * The charset recognize for EUC-JP. A singleton instance of this class
\r
377 * is created and kept by the public CharsetDetector class
\r
379 static class CharsetRecog_euc_jp extends CharsetRecog_euc {
\r
380 static int [] commonChars =
\r
381 // TODO: This set of data comes from the character frequency-
\r
382 // of-occurence analysis tool. The data needs to be moved
\r
383 // into a resource and loaded from there.
\r
384 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
\r
385 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
\r
386 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
\r
387 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
\r
388 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
\r
389 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
\r
390 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
\r
391 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
\r
392 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
\r
393 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
\r
398 int match(CharsetDetector det) {
\r
399 return match(det, commonChars);
\r
402 public String getLanguage()
\r
409 * The charset recognize for EUC-KR. A singleton instance of this class
\r
410 * is created and kept by the public CharsetDetector class
\r
412 static class CharsetRecog_euc_kr extends CharsetRecog_euc {
\r
413 static int [] commonChars =
\r
414 // TODO: This set of data comes from the character frequency-
\r
415 // of-occurence analysis tool. The data needs to be moved
\r
416 // into a resource and loaded from there.
\r
417 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
\r
418 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
\r
419 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
\r
420 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
\r
421 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
\r
422 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
\r
423 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
\r
424 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
\r
425 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
\r
426 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
\r
432 int match(CharsetDetector det) {
\r
433 return match(det, commonChars);
\r
436 public String getLanguage()
\r
445 * GB-18030 recognizer. Uses simplified Chinese statistics.
\r
448 static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
\r
452 * Get the next character value for EUC based encodings.
\r
453 * Character "value" is simply the raw bytes that make up the character
\r
454 * packed into an int.
\r
456 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
457 it.index = it.nextIndex;
\r
460 int secondByte = 0;
\r
462 int fourthByte = 0;
\r
465 firstByte = it.charValue = it.nextByte(det);
\r
467 if (firstByte < 0) {
\r
468 // Ran off the end of the input data
\r
473 if (firstByte <= 0x80) {
\r
474 // single byte char
\r
478 secondByte = it.nextByte(det);
\r
479 it.charValue = (it.charValue << 8) | secondByte;
\r
481 if (firstByte >= 0x81 && firstByte <= 0xFE) {
\r
483 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
\r
488 if (secondByte >= 0x30 && secondByte <= 0x39) {
\r
489 thirdByte = it.nextByte(det);
\r
491 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
\r
492 fourthByte = it.nextByte(det);
\r
494 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
\r
495 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
\r
506 return (it.done == false);
\r
509 static int [] commonChars =
\r
510 // TODO: This set of data comes from the character frequency-
\r
511 // of-occurence analysis tool. The data needs to be moved
\r
512 // into a resource and loaded from there.
\r
513 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
\r
514 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
\r
515 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
\r
516 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
\r
517 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
\r
518 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
\r
519 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
\r
520 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
\r
521 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
\r
522 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
\r
529 int match(CharsetDetector det) {
\r
530 return match(det, commonChars);
\r
533 public String getLanguage()
\r