jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/CharsetRecog_mbcs.java

   1 /*
   2  ****************************************************************************
   3  * Copyright (C) 2005-2012, International Business Machines Corporation and *
   4  * others. All Rights Reserved.                                             *
   5  ****************************************************************************
   6  *
   7  */
   8 package com.ibm.icu.text;
   9
  10 import java.util.Arrays;
  11
  12 /**
  13  * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
  14  *                   Match is determined mostly by the input data adhering to the
  15  *                   encoding scheme for the charset, and, optionally,
  16  *                   frequency-of-occurrence of characters.
  17  * <p/>
  18  *                   Instances of this class are singletons, one per encoding
  19  *                   being recognized.  They are created in the main
  20  *                   CharsetDetector class and kept in the global list of available
  21  *                   encodings to be checked.  The specific encoding being recognized
  22  *                   is determined by subclass.
  23  */
  24 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
  25
  26    /**
  27      * Get the IANA name of this charset.
  28      * @return the charset name.
  29      */
  30     abstract String      getName() ;
  31
  32
  33     /**
  34      * Test the match of this charset with the input text data
  35      *      which is obtained via the CharsetDetector object.
  36      *
  37      * @param det  The CharsetDetector, which contains the input text
  38      *             to be checked for being in this charset.
  39      * @return     Two values packed into one int  (Damn java, anyhow)
  40      *             <br/>
  41      *             bits 0-7:  the match confidence, ranging from 0-100
  42      *             <br/>
  43      *             bits 8-15: The match reason, an enum-like value.
  44      */
  45     int match(CharsetDetector det, int [] commonChars) {
  46         @SuppressWarnings("unused")
  47         int   singleByteCharCount = 0;  //TODO Do we really need this?
  48         int   doubleByteCharCount = 0;
  49         int   commonCharCount     = 0;
  50         int   badCharCount        = 0;
  51         int   totalCharCount      = 0;
  52         int   confidence          = 0;
  53         iteratedChar   iter       = new iteratedChar();
  54
  55         detectBlock: {
  56             for (iter.reset(); nextChar(iter, det);) {
  57                 totalCharCount++;
  58                 if (iter.error) {
  59                     badCharCount++;
  60                 } else {
  61                     long cv = iter.charValue & 0xFFFFFFFFL;
  62
  63                     if (cv <= 0xff) {
  64                         singleByteCharCount++;
  65                     } else {
  66                         doubleByteCharCount++;
  67                         if (commonChars != null) {
  68                             // NOTE: This assumes that there are no 4-byte common chars.
  69                             if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
  70                                 commonCharCount++;
  71                             }
  72                         }
  73                     }
  74                 }
  75                 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
  76                     // Bail out early if the byte data is not matching the encoding scheme.
  77                     break detectBlock;
  78                 }
  79             }
  80
  81             if (doubleByteCharCount <= 10 && badCharCount== 0) {
  82                 // Not many multi-byte chars.
  83                 if (doubleByteCharCount == 0 && totalCharCount < 10) {
  84                     // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
  85                     // We don't have enough data to have any confidence.
  86                     // Statistical analysis of single byte non-ASCII charcters would probably help here.
  87                     confidence = 0;
  88                 }
  89                 else {
  90                     //   ASCII or ISO file?  It's probably not our encoding,
  91                     //   but is not incompatible with our encoding, so don't give it a zero.
  92                     confidence = 10;
  93                 }
  94
  95                 break detectBlock;
  96             }
  97
  98             //
  99             //  No match if there are too many characters that don't fit the encoding scheme.
 100             //    (should we have zero tolerance for these?)
 101             //
 102             if (doubleByteCharCount < 20*badCharCount) {
 103                 confidence = 0;
 104                 break detectBlock;
 105             }
 106
 107             if (commonChars == null) {
 108                 // We have no statistics on frequently occuring characters.
 109                 //  Assess confidence purely on having a reasonable number of
 110                 //  multi-byte characters (the more the better
 111                 confidence = 30 + doubleByteCharCount - 20*badCharCount;
 112                 if (confidence > 100) {
 113                     confidence = 100;
 114                 }
 115             }else {
 116                 //
 117                 // Frequency of occurence statistics exist.
 118                 //
 119                 double maxVal = Math.log((float)doubleByteCharCount / 4);
 120                 double scaleFactor = 90.0 / maxVal;
 121                 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
 122                 confidence = Math.min(confidence, 100);
 123             }
 124         }   // end of detectBlock:
 125
 126         return confidence;
 127     }
 128
 129      // "Character"  iterated character class.
 130      //    Recognizers for specific mbcs encodings make their "characters" available
 131      //    by providing a nextChar() function that fills in an instance of iteratedChar
 132      //    with the next char from the input.
 133      //    The returned characters are not converted to Unicode, but remain as the raw
 134      //    bytes (concatenated into an int) from the codepage data.
 135      //
 136      //  For Asian charsets, use the raw input rather than the input that has been
 137      //   stripped of markup.  Detection only considers multi-byte chars, effectively
 138      //   stripping markup anyway, and double byte chars do occur in markup too.
 139      //
 140      static class iteratedChar {
 141          int             charValue = 0;             // 1-4 bytes from the raw input data
 142          int             index     = 0;
 143          int             nextIndex = 0;
 144          boolean         error     = false;
 145          boolean         done      = false;
 146
 147          void reset() {
 148              charValue = 0;
 149              index     = -1;
 150              nextIndex = 0;
 151              error     = false;
 152              done      = false;
 153          }
 154
 155          int nextByte(CharsetDetector det) {
 156              if (nextIndex >= det.fRawLength) {
 157                  done = true;
 158                  return -1;
 159              }
 160              int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
 161              return byteValue;
 162          }
 163      }
 164
 165      /**
 166       * Get the next character (however many bytes it is) from the input data
 167       *    Subclasses for specific charset encodings must implement this function
 168       *    to get characters according to the rules of their encoding scheme.
 169       *
 170       *  This function is not a method of class iteratedChar only because
 171       *   that would require a lot of extra derived classes, which is awkward.
 172       * @param it  The iteratedChar "struct" into which the returned char is placed.
 173       * @param det The charset detector, which is needed to get at the input byte data
 174       *            being iterated over.
 175       * @return    True if a character was returned, false at end of input.
 176       */
 177      abstract boolean nextChar(iteratedChar it, CharsetDetector det);
 178
 179
 180
 181
 182
 183      /**
 184       *   Shift-JIS charset recognizer.
 185       *
 186       */
 187      static class CharsetRecog_sjis extends CharsetRecog_mbcs {
 188          static int [] commonChars =
 189              // TODO:  This set of data comes from the character frequency-
 190              //        of-occurence analysis tool.  The data needs to be moved
 191              //        into a resource and loaded from there.
 192             {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
 193              0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
 194              0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
 195              0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
 196              0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
 197              0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
 198
 199          boolean nextChar(iteratedChar it, CharsetDetector det) {
 200              it.index = it.nextIndex;
 201              it.error = false;
 202              int firstByte;
 203              firstByte = it.charValue = it.nextByte(det);
 204              if (firstByte < 0) {
 205                  return false;
 206              }
 207
 208              if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
 209                  return true;
 210              }
 211
 212              int secondByte = it.nextByte(det);
 213              if (secondByte < 0)  {
 214                  return false;
 215              }
 216              it.charValue = (firstByte << 8) | secondByte;
 217              if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
 218                  // Illegal second byte value.
 219                  it.error = true;
 220              }
 221              return true;
 222          }
 223
 224          CharsetMatch match(CharsetDetector det) {
 225              int confidence = match(det, commonChars);
 226              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 227          }
 228
 229          String getName() {
 230              return "Shift_JIS";
 231          }
 232
 233          public String getLanguage()
 234          {
 235              return "ja";
 236          }
 237
 238
 239      }
 240
 241
 242      /**
 243       *   Big5 charset recognizer.
 244       *
 245       */
 246      static class CharsetRecog_big5 extends CharsetRecog_mbcs {
 247          static int [] commonChars =
 248              // TODO:  This set of data comes from the character frequency-
 249              //        of-occurence analysis tool.  The data needs to be moved
 250              //        into a resource and loaded from there.
 251             {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
 252              0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
 253              0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
 254              0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
 255              0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
 256              0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
 257              0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
 258              0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
 259              0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
 260              0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
 261
 262          boolean nextChar(iteratedChar it, CharsetDetector det) {
 263              it.index = it.nextIndex;
 264              it.error = false;
 265              int firstByte;
 266              firstByte = it.charValue = it.nextByte(det);
 267              if (firstByte < 0) {
 268                  return false;
 269              }
 270
 271              if (firstByte <= 0x7f || firstByte==0xff) {
 272                  // single byte character.
 273                  return true;
 274              }
 275
 276              int secondByte = it.nextByte(det);
 277              if (secondByte < 0)  {
 278                  return false;
 279              }
 280              it.charValue = (it.charValue << 8) | secondByte;
 281
 282              if (secondByte < 0x40 ||
 283                  secondByte ==0x7f ||
 284                  secondByte == 0xff) {
 285                      it.error = true;
 286              }
 287              return true;
 288          }
 289
 290          CharsetMatch match(CharsetDetector det) {
 291              int confidence = match(det, commonChars);
 292              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 293          }
 294
 295          String getName() {
 296              return "Big5";
 297          }
 298
 299
 300          public String getLanguage()
 301          {
 302              return "zh";
 303          }
 304      }
 305
 306
 307      /**
 308       *   EUC charset recognizers.  One abstract class that provides the common function
 309       *             for getting the next character according to the EUC encoding scheme,
 310       *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
 311       *
 312       */
 313      abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
 314
 315          /*
 316           *  (non-Javadoc)
 317           *  Get the next character value for EUC based encodings.
 318           *  Character "value" is simply the raw bytes that make up the character
 319           *     packed into an int.
 320           */
 321          boolean nextChar(iteratedChar it, CharsetDetector det) {
 322              it.index = it.nextIndex;
 323              it.error = false;
 324              int firstByte  = 0;
 325              int secondByte = 0;
 326              int thirdByte  = 0;
 327              //int fourthByte = 0;
 328
 329              buildChar: {
 330                  firstByte = it.charValue = it.nextByte(det);
 331                  if (firstByte < 0) {
 332                      // Ran off the end of the input data
 333                      it.done = true;
 334                      break buildChar;
 335                  }
 336                  if (firstByte <= 0x8d) {
 337                      // single byte char
 338                      break buildChar;
 339                  }
 340
 341                  secondByte = it.nextByte(det);
 342                  it.charValue = (it.charValue << 8) | secondByte;
 343
 344                  if (firstByte >= 0xA1 && firstByte <= 0xfe) {
 345                      // Two byte Char
 346                      if (secondByte < 0xa1) {
 347                          it.error = true;
 348                      }
 349                      break buildChar;
 350                  }
 351                  if (firstByte == 0x8e) {
 352                      // Code Set 2.
 353                      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 354                      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 355                      // We don't know which we've got.
 356                      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 357                      //   bytes will look like a well formed 2 byte char.
 358                      if (secondByte < 0xa1) {
 359                          it.error = true;
 360                      }
 361                      break buildChar;
 362                  }
 363
 364                  if (firstByte == 0x8f) {
 365                      // Code set 3.
 366                      // Three byte total char size, two bytes of actual char value.
 367                      thirdByte    = it.nextByte(det);
 368                      it.charValue = (it.charValue << 8) | thirdByte;
 369                      if (thirdByte < 0xa1) {
 370                          it.error = true;
 371                      }
 372                  }
 373               }
 374
 375              return (it.done == false);
 376          }
 377
 378          /**
 379           * The charset recognize for EUC-JP.  A singleton instance of this class
 380           *    is created and kept by the public CharsetDetector class
 381           */
 382          static class CharsetRecog_euc_jp extends CharsetRecog_euc {
 383              static int [] commonChars =
 384                  // TODO:  This set of data comes from the character frequency-
 385                  //        of-occurence analysis tool.  The data needs to be moved
 386                  //        into a resource and loaded from there.
 387                 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
 388                  0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
 389                  0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
 390                  0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
 391                  0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
 392                  0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
 393                  0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
 394                  0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
 395                  0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
 396                  0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
 397              String getName() {
 398                  return "EUC-JP";
 399              }
 400
 401              CharsetMatch match(CharsetDetector det) {
 402                  int confidence = match(det, commonChars);
 403                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 404              }
 405
 406              public String getLanguage()
 407              {
 408                  return "ja";
 409              }
 410          }
 411
 412          /**
 413           * The charset recognize for EUC-KR.  A singleton instance of this class
 414           *    is created and kept by the public CharsetDetector class
 415           */
 416          static class CharsetRecog_euc_kr extends CharsetRecog_euc {
 417              static int [] commonChars =
 418                  // TODO:  This set of data comes from the character frequency-
 419                  //        of-occurence analysis tool.  The data needs to be moved
 420                  //        into a resource and loaded from there.
 421                 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
 422                  0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
 423                  0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
 424                  0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
 425                  0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
 426                  0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
 427                  0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
 428                  0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
 429                  0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
 430                  0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
 431
 432              String getName() {
 433                  return "EUC-KR";
 434              }
 435
 436              CharsetMatch match(CharsetDetector det) {
 437                  int confidence = match(det, commonChars);
 438                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 439              }
 440
 441              public String getLanguage()
 442              {
 443                  return "ko";
 444              }
 445          }
 446      }
 447
 448      /**
 449       *
 450       *   GB-18030 recognizer. Uses simplified Chinese statistics.
 451       *
 452       */
 453      static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
 454
 455          /*
 456           *  (non-Javadoc)
 457           *  Get the next character value for EUC based encodings.
 458           *  Character "value" is simply the raw bytes that make up the character
 459           *     packed into an int.
 460           */
 461          boolean nextChar(iteratedChar it, CharsetDetector det) {
 462              it.index = it.nextIndex;
 463              it.error = false;
 464              int firstByte  = 0;
 465              int secondByte = 0;
 466              int thirdByte  = 0;
 467              int fourthByte = 0;
 468
 469              buildChar: {
 470                  firstByte = it.charValue = it.nextByte(det);
 471
 472                  if (firstByte < 0) {
 473                      // Ran off the end of the input data
 474                      it.done = true;
 475                      break buildChar;
 476                  }
 477
 478                  if (firstByte <= 0x80) {
 479                      // single byte char
 480                      break buildChar;
 481                  }
 482
 483                  secondByte = it.nextByte(det);
 484                  it.charValue = (it.charValue << 8) | secondByte;
 485
 486                  if (firstByte >= 0x81 && firstByte <= 0xFE) {
 487                      // Two byte Char
 488                      if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
 489                          break buildChar;
 490                      }
 491
 492                      // Four byte char
 493                      if (secondByte >= 0x30 && secondByte <= 0x39) {
 494                          thirdByte = it.nextByte(det);
 495
 496                          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 497                              fourthByte = it.nextByte(det);
 498
 499                              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 500                                  it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
 501                                  break buildChar;
 502                              }
 503                          }
 504                      }
 505
 506                      it.error = true;
 507                      break buildChar;
 508                  }
 509              }
 510
 511              return (it.done == false);
 512          }
 513
 514          static int [] commonChars =
 515              // TODO:  This set of data comes from the character frequency-
 516              //        of-occurence analysis tool.  The data needs to be moved
 517              //        into a resource and loaded from there.
 518             {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
 519              0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
 520              0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
 521              0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
 522              0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
 523              0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
 524              0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
 525              0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
 526              0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
 527              0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
 528
 529
 530          String getName() {
 531              return "GB18030";
 532          }
 533
 534          CharsetMatch match(CharsetDetector det) {
 535              int confidence = match(det, commonChars);
 536              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
 537          }
 538
 539          public String getLanguage()
 540          {
 541              return "zh";
 542          }
 543      }
 544
 545
 546 }