2 *******************************************************************************
\r
3 * Copyright (C) 2005 - 2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
10 * class CharsetRecog_2022 part of the ICU charset detection implementation.
\r
11 * This is a superclass for the individual detectors for
\r
12 * each of the detectable members of the ISO 2022 family
\r
15 * The separate classes are nested within this class.
\r
17 abstract class CharsetRecog_2022 extends CharsetRecognizer {
\r
21 * Matching function shared among the 2022 detectors JP, CN and KR
\r
22 * Counts up the number of legal and unrecognized escape sequences in
\r
23 * the sample of text, and computes a score based on the total number &
\r
24 * the proportion that fit the encoding.
\r
27 * @param text the byte buffer containing text to analyse
\r
28 * @param textLen the number of bytes of text to examine.
\r
29 * @param escapeSequences the byte escape sequences to test for.
\r
30 * @return match quality, in the range of 0-100.
\r
32 int match(byte [] text, int textLen, byte [][] escapeSequences) {
\r
// NOTE(review): this listing omits lines of the method (local declarations, the loop
// labels, the hit/miss/shift counter updates, closing braces and the return). The comments
// below describe only what the visible statements show.
// Walk the first textLen bytes of the sample one byte at a time.
40 for (i=0; i<textLen; i++) {
\r
41 if (text[i] == 0x1b) { // 0x1b is ESC: a candidate escape sequence starts here
\r
// Try every candidate sequence against the bytes starting at position i.
43 for (escN=0; escN<escapeSequences.length; escN++) {
\r
44 byte [] seq = escapeSequences[escN];
\r
46 if ((textLen - i) < seq.length) { // fewer bytes remain than this sequence needs
\r
47 continue checkEscapes; // move on to the next candidate (label declaration not visible here)
\r
// Comparison starts at index 1: text[i] is already known to be 0x1b from the test above.
// NOTE(review): this relies on every table entry having 0x1b as its first byte — confirm.
50 for (j=1; j<seq.length; j++) {
\r
51 if (seq[j] != text[i+j]) { // first mismatching byte rules this candidate out
\r
52 continue checkEscapes;
\r
64 if (text[i] == 0x0e || text[i] == 0x0f) { // 0x0e / 0x0f are the ASCII Shift Out / Shift In bytes
\r
75 // Initial quality is based on relative proportion of recognized vs.
\r
76 // unrecognized escape sequences.
\r
77 // All good: quality = 100;
\r
78 // half or less good: quality = 0;
\r
79 // linear in between.
\r
// NOTE(review): integer arithmetic. The expression itself is negative when misses > hits
// and divides by zero when hits + misses == 0; the guard and the clamp to 0 that the
// comment above implies are not visible in this excerpt — confirm they exist.
80 quality = (100*hits - 100*misses) / (hits + misses);
\r
82 // Back off quality if there were too few escape sequences seen.
\r
83 // Include shifts in this computation, so that KR does not get penalized
\r
84 // for having only a single Escape sequence, but many shifts.
\r
85 if (hits+shifts < 5) {
\r
86 quality -= (5-(hits+shifts))*10; // subtract 10 points for each hit-or-shift short of five
\r
98 static class CharsetRecog_2022JP extends CharsetRecog_2022 {
\r
// Escape sequences this recognizer hands to the shared three-argument match().
// Every entry begins with 0x1b (ESC); the trailing comments name the character set
// each sequence designates.
// NOTE(review): declared as an instance field (neither static nor final), so each
// recognizer instance builds its own copy of the table.
99 private byte [] [] escapeSequences = {
\r
100 {0x1b, 0x24, 0x28, 0x43}, // KS X 1001:1992
\r
101 {0x1b, 0x24, 0x28, 0x44}, // JIS X 0212-1990
\r
102 {0x1b, 0x24, 0x40}, // JIS C 6226-1978
\r
103 {0x1b, 0x24, 0x41}, // GB 2312-80
\r
104 {0x1b, 0x24, 0x42}, // JIS X 0208-1983
\r
105 {0x1b, 0x26, 0x40}, // JIS X 0208 1990, 1997
\r
106 {0x1b, 0x28, 0x42}, // ASCII
\r
107 {0x1b, 0x28, 0x48}, // JIS-Roman
\r
108 {0x1b, 0x28, 0x49}, // Half-width katakana
\r
109 {0x1b, 0x28, 0x4a}, // JIS-Roman
\r
110 {0x1b, 0x2e, 0x41}, // ISO 8859-1
\r
111 {0x1b, 0x2e, 0x46} // ISO 8859-7
\r
// NOTE(review): the header of the method containing this return is not visible in this
// excerpt; presumably it is the recognizer's charset-name accessor — confirm.
115 return "ISO-2022-JP";
\r
// Scores the detector's input by delegating to the shared match(byte[], int, byte[][])
// with the detector's byte buffer, its length, and this class's escape-sequence table.
118 int match(CharsetDetector det) {
\r
119 return match(det.fInputBytes, det.fInputLen, escapeSequences);
\r
123 static class CharsetRecog_2022KR extends CharsetRecog_2022 {
\r
// The table holds a single escape sequence, so the shared match() can register at most
// one distinct kind of hit for this recognizer.
// NOTE(review): instance field (neither static nor final); one copy per instance.
124 private byte [] [] escapeSequences = {
\r
125 {0x1b, 0x24, 0x29, 0x43} // bytes are ESC $ ) C; presumably the ISO-2022-KR designator (RFC 1557) — confirm
\r
// NOTE(review): the header of the method containing this return is not visible in this
// excerpt; presumably it is the recognizer's charset-name accessor — confirm.
129 return "ISO-2022-KR";
\r
// Scores the detector's input by delegating to the shared match(byte[], int, byte[][])
// with the detector's byte buffer, its length, and this class's escape-sequence table.
132 int match(CharsetDetector det) {
\r
133 return match(det.fInputBytes, det.fInputLen, escapeSequences);
\r
138 static class CharsetRecog_2022CN extends CharsetRecog_2022 {
\r
// Escape sequences this recognizer hands to the shared three-argument match().
// Every entry begins with 0x1b (ESC); the trailing comments name what each sequence
// designates. The last two entries are two-byte sequences, shorter than the rest.
// NOTE(review): instance field (neither static nor final); one copy per instance.
139 private byte [] [] escapeSequences = {
\r
140 {0x1b, 0x24, 0x29, 0x41}, // GB 2312-80
\r
141 {0x1b, 0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
\r
142 {0x1b, 0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
\r
143 {0x1b, 0x24, 0x29, 0x45}, // ISO-IR-165
\r
144 {0x1b, 0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
\r
145 {0x1b, 0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
\r
146 {0x1b, 0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
\r
147 {0x1b, 0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
\r
148 {0x1b, 0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
\r
149 {0x1b, 0x4e}, // SS2
\r
150 {0x1b, 0x4f}, // SS3
\r
// NOTE(review): the header of the method containing this return is not visible in this
// excerpt; presumably it is the recognizer's charset-name accessor — confirm.
154 return "ISO-2022-CN";
\r
// Scores the detector's input by delegating to the shared match(byte[], int, byte[][])
// with the detector's byte buffer, its length, and this class's escape-sequence table.
158 int match(CharsetDetector det) {
\r
159 return match(det.fInputBytes, det.fInputLen, escapeSequences);
\r