2 ****************************************************************************
\r
3 * Copyright (C) 2005-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ****************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.util.Arrays;
\r
13 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
\r
14 * Match is determined mostly by the input data adhering to the
\r
15 * encoding scheme for the charset, and, optionally,
\r
16 * frequency-of-occurrence of characters.
\r
18 * Instances of this class are singletons, one per encoding
\r
19 * being recognized. They are created in the main
\r
20 * CharsetDetector class and kept in the global list of available
\r
21 * encodings to be checked. The specific encoding being recognized
\r
22 * is determined by subclass.
\r
26 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
\r
29 * Get the IANA name of this charset.
\r
30 * @return the charset name.
\r
32 abstract String getName() ;
\r
36 * Test the match of this charset with the input text data
\r
37 * which is obtained via the CharsetDetector object.
\r
39 * @param det The CharsetDetector, which contains the input text
\r
40 * to be checked for being in this charset.
\r
41 * @return Two values packed into one int (Damn java, anyhow)
\r
43 * bits 0-7: the match confidence, ranging from 0-100
\r
45 * bits 8-15: The match reason, an enum-like value.
\r
47 int match(CharsetDetector det, int [] commonChars) {
\r
48 int singleByteCharCount = 0;
\r
49 int doubleByteCharCount = 0;
\r
50 int commonCharCount = 0;
\r
51 int badCharCount = 0;
\r
52 int totalCharCount = 0;
\r
54 iteratedChar iter = new iteratedChar();
\r
57 for (iter.reset(); nextChar(iter, det);) {
\r
62 long cv = iter.charValue & 0xFFFFFFFFL;
\r
65 singleByteCharCount++;
\r
67 doubleByteCharCount++;
\r
68 if (commonChars != null) {
\r
69 // NOTE: This assumes that there are no 4-byte common chars.
\r
70 if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
\r
76 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
\r
77 // Bail out early if the byte data is not matching the encoding scheme.
\r
82 if (doubleByteCharCount <= 10 && badCharCount== 0) {
\r
83 // Not many multi-byte chars.
\r
84 if (doubleByteCharCount == 0 && totalCharCount < 10) {
\r
85 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
\r
86 // We don't have enough data to have any confidence.
\r
87 // Statistical analysis of single byte non-ASCII characters would probably help here.
\r
91 // ASCII or ISO file? It's probably not our encoding,
\r
92 // but is not incompatible with our encoding, so don't give it a zero.
\r
100 // No match if there are too many characters that don't fit the encoding scheme.
\r
101 // (should we have zero tolerance for these?)
\r
103 if (doubleByteCharCount < 20*badCharCount) {
\r
108 if (commonChars == null) {
\r
109 // We have no statistics on frequently occurring characters.
\r
110 // Assess confidence purely on having a reasonable number of
\r
111 // multi-byte characters (the more the better).
\r
112 confidence = 30 + doubleByteCharCount - 20*badCharCount;
\r
113 if (confidence > 100) {
\r
118 // Frequency of occurrence statistics exist.
\r
120 double maxVal = Math.log((float)doubleByteCharCount / 4);
\r
121 double scaleFactor = 90.0 / maxVal;
\r
122 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
\r
123 confidence = Math.min(confidence, 100);
\r
125 } // end of detectBlock:
\r
130 // "Character" iterated character class.
\r
131 // Recognizers for specific mbcs encodings make their "characters" available
\r
132 // by providing a nextChar() function that fills in an instance of iteratedChar
\r
133 // with the next char from the input.
\r
134 // The returned characters are not converted to Unicode, but remain as the raw
\r
135 // bytes (concatenated into an int) from the codepage data.
\r
137 // For Asian charsets, use the raw input rather than the input that has been
\r
138 // stripped of markup. Detection only considers multi-byte chars, effectively
\r
139 // stripping markup anyway, and double byte chars do occur in markup too.
\r
141 static class iteratedChar {
\r
142 int charValue = 0; // 1-4 bytes from the raw input data
\r
145 boolean error = false;
\r
146 boolean done = false;
\r
156 int nextByte(CharsetDetector det) {
\r
157 if (nextIndex >= det.fRawLength) {
\r
161 int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
\r
167 * Get the next character (however many bytes it is) from the input data
\r
168 * Subclasses for specific charset encodings must implement this function
\r
169 * to get characters according to the rules of their encoding scheme.
\r
171 * This function is not a method of class iteratedChar only because
\r
172 * that would require a lot of extra derived classes, which is awkward.
\r
173 * @param it The iteratedChar "struct" into which the returned char is placed.
\r
174 * @param det The charset detector, which is needed to get at the input byte data
\r
175 * being iterated over.
\r
176 * @return True if a character was returned, false at end of input.
\r
178 abstract boolean nextChar(iteratedChar it, CharsetDetector det);
\r
185 * Shift-JIS charset recognizer.
\r
188 static class CharsetRecog_sjis extends CharsetRecog_mbcs {
\r
189 static int [] commonChars =
\r
190 // TODO: This set of data comes from the character frequency-
\r
191 // of-occurrence analysis tool. The data needs to be moved
\r
192 // into a resource and loaded from there.
\r
193 {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
\r
194 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
\r
195 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
\r
196 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
\r
197 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
\r
198 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
\r
200 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
201 it.index = it.nextIndex;
\r
204 firstByte = it.charValue = it.nextByte(det);
\r
205 if (firstByte < 0) {
\r
209 if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
\r
213 int secondByte = it.nextByte(det);
\r
214 if (secondByte < 0) {
\r
217 it.charValue = (firstByte << 8) | secondByte;
\r
218 if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
\r
219 // Illegal second byte value.
\r
225 int match(CharsetDetector det) {
\r
226 return match(det, commonChars);
\r
230 return "Shift_JIS";
\r
233 public String getLanguage()
\r
243 * Big5 charset recognizer.
\r
246 static class CharsetRecog_big5 extends CharsetRecog_mbcs {
\r
247 static int [] commonChars =
\r
248 // TODO: This set of data comes from the character frequency-
\r
249 // of-occurrence analysis tool. The data needs to be moved
\r
250 // into a resource and loaded from there.
\r
251 {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
\r
252 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
\r
253 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
\r
254 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
\r
255 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
\r
256 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
\r
257 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
\r
258 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
\r
259 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
\r
260 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
\r
262 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
263 it.index = it.nextIndex;
\r
266 firstByte = it.charValue = it.nextByte(det);
\r
267 if (firstByte < 0) {
\r
271 if (firstByte <= 0x7f || firstByte==0xff) {
\r
272 // single byte character.
\r
276 int secondByte = it.nextByte(det);
\r
277 if (secondByte < 0) {
\r
280 it.charValue = (it.charValue << 8) | secondByte;
\r
282 if (secondByte < 0x40 ||
\r
283 secondByte ==0x7f ||
\r
284 secondByte == 0xff) {
\r
290 int match(CharsetDetector det) {
\r
291 return match(det, commonChars);
\r
299 public String getLanguage()
\r
307 * EUC charset recognizers. One abstract class that provides the common function
\r
308 * for getting the next character according to the EUC encoding scheme,
\r
309 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
\r
312 abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
\r
316 * Get the next character value for EUC based encodings.
\r
317 * Character "value" is simply the raw bytes that make up the character
\r
318 * packed into an int.
\r
320 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
321 it.index = it.nextIndex;
\r
324 int secondByte = 0;
\r
326 //int fourthByte = 0;
\r
329 firstByte = it.charValue = it.nextByte(det);
\r
330 if (firstByte < 0) {
\r
331 // Ran off the end of the input data
\r
335 if (firstByte <= 0x8d) {
\r
336 // single byte char
\r
340 secondByte = it.nextByte(det);
\r
341 it.charValue = (it.charValue << 8) | secondByte;
\r
343 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
\r
345 if (secondByte < 0xa1) {
\r
350 if (firstByte == 0x8e) {
\r
352 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
\r
353 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
\r
354 // We don't know which we've got.
\r
355 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
\r
356 // bytes will look like a well formed 2 byte char.
\r
357 if (secondByte < 0xa1) {
\r
363 if (firstByte == 0x8f) {
\r
365 // Three byte total char size, two bytes of actual char value.
\r
366 thirdByte = it.nextByte(det);
\r
367 it.charValue = (it.charValue << 8) | thirdByte;
\r
368 if (thirdByte < 0xa1) {
\r
374 return (it.done == false);
\r
378 * The charset recognizer for EUC-JP. A singleton instance of this class
\r
379 * is created and kept by the public CharsetDetector class
\r
381 static class CharsetRecog_euc_jp extends CharsetRecog_euc {
\r
382 static int [] commonChars =
\r
383 // TODO: This set of data comes from the character frequency-
\r
384 // of-occurrence analysis tool. The data needs to be moved
\r
385 // into a resource and loaded from there.
\r
386 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
\r
387 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
\r
388 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
\r
389 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
\r
390 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
\r
391 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
\r
392 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
\r
393 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
\r
394 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
\r
395 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
\r
400 int match(CharsetDetector det) {
\r
401 return match(det, commonChars);
\r
404 public String getLanguage()
\r
411 * The charset recognizer for EUC-KR. A singleton instance of this class
\r
412 * is created and kept by the public CharsetDetector class
\r
414 static class CharsetRecog_euc_kr extends CharsetRecog_euc {
\r
415 static int [] commonChars =
\r
416 // TODO: This set of data comes from the character frequency-
\r
417 // of-occurrence analysis tool. The data needs to be moved
\r
418 // into a resource and loaded from there.
\r
419 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
\r
420 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
\r
421 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
\r
422 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
\r
423 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
\r
424 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
\r
425 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
\r
426 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
\r
427 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
\r
428 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
\r
434 int match(CharsetDetector det) {
\r
435 return match(det, commonChars);
\r
438 public String getLanguage()
\r
447 * GB-18030 recognizer. Uses simplified Chinese statistics.
\r
450 static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
\r
454 * Get the next character value for the GB-18030 encoding.
\r
455 * Character "value" is simply the raw bytes that make up the character
\r
456 * packed into an int.
\r
458 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
459 it.index = it.nextIndex;
\r
462 int secondByte = 0;
\r
464 int fourthByte = 0;
\r
467 firstByte = it.charValue = it.nextByte(det);
\r
469 if (firstByte < 0) {
\r
470 // Ran off the end of the input data
\r
475 if (firstByte <= 0x80) {
\r
476 // single byte char
\r
480 secondByte = it.nextByte(det);
\r
481 it.charValue = (it.charValue << 8) | secondByte;
\r
483 if (firstByte >= 0x81 && firstByte <= 0xFE) {
\r
485 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) { // two-byte form: trail byte is 0x40-0x7E or 0x80-0xFE (hex 0x80, not decimal 80, so 0x7F is rejected)
\r
490 if (secondByte >= 0x30 && secondByte <= 0x39) {
\r
491 thirdByte = it.nextByte(det);
\r
493 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
\r
494 fourthByte = it.nextByte(det);
\r
496 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
\r
497 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
\r
508 return (it.done == false);
\r
511 static int [] commonChars =
\r
512 // TODO: This set of data comes from the character frequency-
\r
513 // of-occurrence analysis tool. The data needs to be moved
\r
514 // into a resource and loaded from there.
\r
515 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
\r
516 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
\r
517 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
\r
518 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
\r
519 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
\r
520 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
\r
521 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
\r
522 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
\r
523 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
\r
524 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
\r
531 int match(CharsetDetector det) {
\r
532 return match(det, commonChars);
\r
535 public String getLanguage()
\r