/**
 ****************************************************************************
 * Copyright (C) 2005-2010, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 */
package com.ibm.icu.text;

import java.util.Arrays;
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 * <p>
 * Instances of this class are singletons, one per encoding
 * being recognized. They are created in the main
 * CharsetDetector class and kept in the global list of available
 * encodings to be checked. The specific encoding being recognized
 * is determined by subclass.
 */
24 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
\r
27 * Get the IANA name of this charset.
\r
28 * @return the charset name.
\r
30 abstract String getName() ;
\r
34 * Test the match of this charset with the input text data
\r
35 * which is obtained via the CharsetDetector object.
\r
37 * @param det The CharsetDetector, which contains the input text
\r
38 * to be checked for being in this charset.
\r
39 * @return Two values packed into one int (Damn java, anyhow)
\r
41 * bits 0-7: the match confidence, ranging from 0-100
\r
43 * bits 8-15: The match reason, an enum-like value.
\r
45 int match(CharsetDetector det, int [] commonChars) {
\r
46 int singleByteCharCount = 0;
\r
47 int doubleByteCharCount = 0;
\r
48 int commonCharCount = 0;
\r
49 int badCharCount = 0;
\r
50 int totalCharCount = 0;
\r
52 iteratedChar iter = new iteratedChar();
\r
55 for (iter.reset(); nextChar(iter, det);) {
\r
60 long cv = iter.charValue & 0xFFFFFFFFL;
\r
63 singleByteCharCount++;
\r
65 doubleByteCharCount++;
\r
66 if (commonChars != null) {
\r
67 // NOTE: This assumes that there are no 4-byte common chars.
\r
68 if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
\r
74 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
\r
75 // Bail out early if the byte data is not matching the encoding scheme.
\r
80 if (doubleByteCharCount <= 10 && badCharCount== 0) {
\r
81 // Not many multi-byte chars.
\r
82 if (doubleByteCharCount == 0 && totalCharCount < 10) {
\r
83 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
\r
84 // We don't have enough data to have any confidence.
\r
85 // Statistical analysis of single byte non-ASCII charcters would probably help here.
\r
89 // ASCII or ISO file? It's probably not our encoding,
\r
90 // but is not incompatible with our encoding, so don't give it a zero.
\r
98 // No match if there are too many characters that don't fit the encoding scheme.
\r
99 // (should we have zero tolerance for these?)
\r
101 if (doubleByteCharCount < 20*badCharCount) {
\r
106 if (commonChars == null) {
\r
107 // We have no statistics on frequently occuring characters.
\r
108 // Assess confidence purely on having a reasonable number of
\r
109 // multi-byte characters (the more the better
\r
110 confidence = 30 + doubleByteCharCount - 20*badCharCount;
\r
111 if (confidence > 100) {
\r
116 // Frequency of occurence statistics exist.
\r
118 double maxVal = Math.log((float)doubleByteCharCount / 4);
\r
119 double scaleFactor = 90.0 / maxVal;
\r
120 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
\r
121 confidence = Math.min(confidence, 100);
\r
123 } // end of detectBlock:
\r
128 // "Character" iterated character class.
\r
129 // Recognizers for specific mbcs encodings make their "characters" available
\r
130 // by providing a nextChar() function that fills in an instance of iteratedChar
\r
131 // with the next char from the input.
\r
132 // The returned characters are not converted to Unicode, but remain as the raw
\r
133 // bytes (concatenated into an int) from the codepage data.
\r
135 // For Asian charsets, use the raw input rather than the input that has been
\r
136 // stripped of markup. Detection only considers multi-byte chars, effectively
\r
137 // stripping markup anyway, and double byte chars do occur in markup too.
\r
139 static class iteratedChar {
\r
140 int charValue = 0; // 1-4 bytes from the raw input data
\r
143 boolean error = false;
\r
144 boolean done = false;
\r
154 int nextByte(CharsetDetector det) {
\r
155 if (nextIndex >= det.fRawLength) {
\r
159 int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
\r
165 * Get the next character (however many bytes it is) from the input data
\r
166 * Subclasses for specific charset encodings must implement this function
\r
167 * to get characters according to the rules of their encoding scheme.
\r
169 * This function is not a method of class iteratedChar only because
\r
170 * that would require a lot of extra derived classes, which is awkward.
\r
171 * @param it The iteratedChar "struct" into which the returned char is placed.
\r
172 * @param det The charset detector, which is needed to get at the input byte data
\r
173 * being iterated over.
\r
174 * @return True if a character was returned, false at end of input.
\r
176 abstract boolean nextChar(iteratedChar it, CharsetDetector det);
\r
183 * Shift-JIS charset recognizer.
\r
186 static class CharsetRecog_sjis extends CharsetRecog_mbcs {
\r
187 static int [] commonChars =
\r
188 // TODO: This set of data comes from the character frequency-
\r
189 // of-occurence analysis tool. The data needs to be moved
\r
190 // into a resource and loaded from there.
\r
191 {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
\r
192 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
\r
193 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
\r
194 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
\r
195 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
\r
196 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
\r
198 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
199 it.index = it.nextIndex;
\r
202 firstByte = it.charValue = it.nextByte(det);
\r
203 if (firstByte < 0) {
\r
207 if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
\r
211 int secondByte = it.nextByte(det);
\r
212 if (secondByte < 0) {
\r
215 it.charValue = (firstByte << 8) | secondByte;
\r
216 if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
\r
217 // Illegal second byte value.
\r
223 int match(CharsetDetector det) {
\r
224 return match(det, commonChars);
\r
228 return "Shift_JIS";
\r
231 public String getLanguage()
\r
241 * Big5 charset recognizer.
\r
244 static class CharsetRecog_big5 extends CharsetRecog_mbcs {
\r
245 static int [] commonChars =
\r
246 // TODO: This set of data comes from the character frequency-
\r
247 // of-occurence analysis tool. The data needs to be moved
\r
248 // into a resource and loaded from there.
\r
249 {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
\r
250 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
\r
251 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
\r
252 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
\r
253 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
\r
254 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
\r
255 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
\r
256 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
\r
257 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
\r
258 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
\r
260 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
261 it.index = it.nextIndex;
\r
264 firstByte = it.charValue = it.nextByte(det);
\r
265 if (firstByte < 0) {
\r
269 if (firstByte <= 0x7f || firstByte==0xff) {
\r
270 // single byte character.
\r
274 int secondByte = it.nextByte(det);
\r
275 if (secondByte < 0) {
\r
278 it.charValue = (it.charValue << 8) | secondByte;
\r
280 if (secondByte < 0x40 ||
\r
281 secondByte ==0x7f ||
\r
282 secondByte == 0xff) {
\r
288 int match(CharsetDetector det) {
\r
289 return match(det, commonChars);
\r
297 public String getLanguage()
\r
305 * EUC charset recognizers. One abstract class that provides the common function
\r
306 * for getting the next character according to the EUC encoding scheme,
\r
307 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
\r
310 abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
\r
314 * Get the next character value for EUC based encodings.
\r
315 * Character "value" is simply the raw bytes that make up the character
\r
316 * packed into an int.
\r
318 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
319 it.index = it.nextIndex;
\r
322 int secondByte = 0;
\r
324 //int fourthByte = 0;
\r
327 firstByte = it.charValue = it.nextByte(det);
\r
328 if (firstByte < 0) {
\r
329 // Ran off the end of the input data
\r
333 if (firstByte <= 0x8d) {
\r
334 // single byte char
\r
338 secondByte = it.nextByte(det);
\r
339 it.charValue = (it.charValue << 8) | secondByte;
\r
341 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
\r
343 if (secondByte < 0xa1) {
\r
348 if (firstByte == 0x8e) {
\r
350 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
\r
351 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
\r
352 // We don't know which we've got.
\r
353 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
\r
354 // bytes will look like a well formed 2 byte char.
\r
355 if (secondByte < 0xa1) {
\r
361 if (firstByte == 0x8f) {
\r
363 // Three byte total char size, two bytes of actual char value.
\r
364 thirdByte = it.nextByte(det);
\r
365 it.charValue = (it.charValue << 8) | thirdByte;
\r
366 if (thirdByte < 0xa1) {
\r
372 return (it.done == false);
\r
376 * The charset recognize for EUC-JP. A singleton instance of this class
\r
377 * is created and kept by the public CharsetDetector class
\r
379 static class CharsetRecog_euc_jp extends CharsetRecog_euc {
\r
380 static int [] commonChars =
\r
381 // TODO: This set of data comes from the character frequency-
\r
382 // of-occurence analysis tool. The data needs to be moved
\r
383 // into a resource and loaded from there.
\r
384 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
\r
385 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
\r
386 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
\r
387 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
\r
388 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
\r
389 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
\r
390 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
\r
391 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
\r
392 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
\r
393 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
\r
398 int match(CharsetDetector det) {
\r
399 return match(det, commonChars);
\r
402 public String getLanguage()
\r
409 * The charset recognize for EUC-KR. A singleton instance of this class
\r
410 * is created and kept by the public CharsetDetector class
\r
412 static class CharsetRecog_euc_kr extends CharsetRecog_euc {
\r
413 static int [] commonChars =
\r
414 // TODO: This set of data comes from the character frequency-
\r
415 // of-occurence analysis tool. The data needs to be moved
\r
416 // into a resource and loaded from there.
\r
417 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
\r
418 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
\r
419 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
\r
420 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
\r
421 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
\r
422 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
\r
423 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
\r
424 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
\r
425 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
\r
426 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
\r
432 int match(CharsetDetector det) {
\r
433 return match(det, commonChars);
\r
436 public String getLanguage()
\r
445 * GB-18030 recognizer. Uses simplified Chinese statistics.
\r
448 static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
\r
452 * Get the next character value for EUC based encodings.
\r
453 * Character "value" is simply the raw bytes that make up the character
\r
454 * packed into an int.
\r
456 boolean nextChar(iteratedChar it, CharsetDetector det) {
\r
457 it.index = it.nextIndex;
\r
460 int secondByte = 0;
\r
462 int fourthByte = 0;
\r
465 firstByte = it.charValue = it.nextByte(det);
\r
467 if (firstByte < 0) {
\r
468 // Ran off the end of the input data
\r
473 if (firstByte <= 0x80) {
\r
474 // single byte char
\r
478 secondByte = it.nextByte(det);
\r
479 it.charValue = (it.charValue << 8) | secondByte;
\r
481 if (firstByte >= 0x81 && firstByte <= 0xFE) {
\r
483 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
\r
488 if (secondByte >= 0x30 && secondByte <= 0x39) {
\r
489 thirdByte = it.nextByte(det);
\r
491 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
\r
492 fourthByte = it.nextByte(det);
\r
494 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
\r
495 it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
\r
506 return (it.done == false);
\r
509 static int [] commonChars =
\r
510 // TODO: This set of data comes from the character frequency-
\r
511 // of-occurence analysis tool. The data needs to be moved
\r
512 // into a resource and loaded from there.
\r
513 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
\r
514 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
\r
515 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
\r
516 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
\r
517 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
\r
518 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
\r
519 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
\r
520 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
\r
521 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
\r
522 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
\r
529 int match(CharsetDetector det) {
\r
530 return match(det, commonChars);
\r
533 public String getLanguage()
\r