jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/CharsetRecog_2022.java

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2005 - 2012, International Business Machines Corporation and  *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 */
   7 package com.ibm.icu.text;
   8
   9 /**
  10  *  class CharsetRecog_2022  part of the ICU charset detection implementation.
  11  *                           This is a superclass for the individual detectors for
  12  *                           each of the detectable members of the ISO 2022 family
  13  *                           of encodings.
  14  *
  15  *                           The separate classes are nested within this class.
  16  */
  17 abstract class CharsetRecog_2022 extends CharsetRecognizer {
  18
  19
  20     /**
  21      * Matching function shared among the 2022 detectors JP, CN and KR
  22      * Counts up the number of legal an unrecognized escape sequences in
  23      * the sample of text, and computes a score based on the total number &
  24      * the proportion that fit the encoding.
  25      *
  26      *
  27      * @param text the byte buffer containing text to analyse
  28      * @param textLen  the size of the text in the byte.
  29      * @param escapeSequences the byte escape sequences to test for.
  30      * @return match quality, in the range of 0-100.
  31      */
  32     int   match(byte [] text, int textLen, byte [][] escapeSequences) {
  33         int     i, j;
  34         int     escN;
  35         int     hits   = 0;
  36         int     misses = 0;
  37         int     shifts = 0;
  38         int     quality;
  39         scanInput:
  40             for (i=0; i<textLen; i++) {
  41                 if (text[i] == 0x1b) {
  42                     checkEscapes:
  43                         for (escN=0; escN<escapeSequences.length; escN++) {
  44                             byte [] seq = escapeSequences[escN];
  45
  46                             if ((textLen - i) < seq.length) {
  47                                 continue checkEscapes;
  48                             }
  49
  50                             for (j=1; j<seq.length; j++) {
  51                                 if (seq[j] != text[i+j])  {
  52                                     continue checkEscapes;
  53                                 }
  54                             }
  55
  56                             hits++;
  57                             i += seq.length-1;
  58                             continue scanInput;
  59                         }
  60
  61                         misses++;
  62                 }
  63
  64                 if (text[i] == 0x0e || text[i] == 0x0f) {
  65                     // Shift in/out
  66                     shifts++;
  67                 }
  68             }
  69
  70         if (hits == 0) {
  71             return 0;
  72         }
  73
  74         //
  75         // Initial quality is based on relative proportion of recongized vs.
  76         //   unrecognized escape sequences.
  77         //   All good:  quality = 100;
  78         //   half or less good: quality = 0;
  79         //   linear inbetween.
  80         quality = (100*hits - 100*misses) / (hits + misses);
  81
  82         // Back off quality if there were too few escape sequences seen.
  83         //   Include shifts in this computation, so that KR does not get penalized
  84         //   for having only a single Escape sequence, but many shifts.
  85         if (hits+shifts < 5) {
  86             quality -= (5-(hits+shifts))*10;
  87         }
  88
  89         if (quality < 0) {
  90             quality = 0;
  91         }
  92         return quality;
  93     }
  94
  95
  96
  97
  98     static class CharsetRecog_2022JP extends CharsetRecog_2022 {
  99         private byte [] [] escapeSequences = {
 100                 {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
 101                 {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
 102                 {0x1b, 0x24, 0x40},         // JIS C 6226-1978
 103                 {0x1b, 0x24, 0x41},         // GB 2312-80
 104                 {0x1b, 0x24, 0x42},         // JIS X 208-1983
 105                 {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
 106                 {0x1b, 0x28, 0x42},         // ASCII
 107                 {0x1b, 0x28, 0x48},         // JIS-Roman
 108                 {0x1b, 0x28, 0x49},         // Half-width katakana
 109                 {0x1b, 0x28, 0x4a},         // JIS-Roman
 110                 {0x1b, 0x2e, 0x41},         // ISO 8859-1
 111                 {0x1b, 0x2e, 0x46}          // ISO 8859-7
 112                 };
 113
 114         String getName() {
 115             return "ISO-2022-JP";
 116         }
 117
 118         CharsetMatch   match(CharsetDetector det) {
 119             int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
 120             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 121         }
 122     }
 123
 124     static class CharsetRecog_2022KR extends CharsetRecog_2022 {
 125         private byte [] [] escapeSequences = {
 126                 {0x1b, 0x24, 0x29, 0x43}
 127                  };
 128
 129         String getName() {
 130             return "ISO-2022-KR";
 131         }
 132
 133         CharsetMatch   match(CharsetDetector det) {
 134             int confidence =  match(det.fInputBytes, det.fInputLen, escapeSequences);
 135             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 136         }
 137     }
 138
 139     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
 140         private byte [] [] escapeSequences = {
 141                 {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
 142                 {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
 143                 {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
 144                 {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
 145                 {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
 146                 {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
 147                 {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
 148                 {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
 149                 {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
 150                 {0x1b, 0x4e},               // SS2
 151                 {0x1b, 0x4f},               // SS3
 152         };
 153
 154         String getName() {
 155             return "ISO-2022-CN";
 156         }
 157
 158         CharsetMatch   match(CharsetDetector det) {
 159             int confidence = match(det.fInputBytes, det.fInputLen, escapeSequences);
 160             return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 161         }
 162     }
 163
 164 }
 165