jars/icu4j-4_8_1_1/main/classes/core/src/com/ibm/icu/text/CharsetRecog_UTF8.java

   1 /**
   2 *******************************************************************************
   3 * Copyright (C) 2005 - 2010, International Business Machines Corporation and  *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 */
   7 package com.ibm.icu.text;
   8
   9 /**
  10  * Charset recognizer for UTF-8
  11  */
  12 class CharsetRecog_UTF8 extends CharsetRecognizer {
  13
  14     String getName() {
  15         return "UTF-8";
  16     }
  17
  18     /* (non-Javadoc)
  19      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
  20      */
  21     int match(CharsetDetector det) {
  22         boolean     hasBOM = false;
  23         int         numValid = 0;
  24         int         numInvalid = 0;
  25         byte        input[] = det.fRawInput;
  26         int         i;
  27         int         trailBytes = 0;
  28         int         confidence;
  29
  30         if (det.fRawLength >= 3 &&
  31                 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb & (input[2] & 0xFF) == 0xbf) {
  32             hasBOM = true;
  33         }
  34
  35         // Scan for multi-byte sequences
  36         for (i=0; i<det.fRawLength; i++) {
  37             int b = input[i];
  38             if ((b & 0x80) == 0) {
  39                 continue;   // ASCII
  40             }
  41
  42             // Hi bit on char found.  Figure out how long the sequence should be
  43             if ((b & 0x0e0) == 0x0c0) {
  44                 trailBytes = 1;
  45             } else if ((b & 0x0f0) == 0x0e0) {
  46                 trailBytes = 2;
  47             } else if ((b & 0x0f8) == 0xf0) {
  48                 trailBytes = 3;
  49             } else {
  50                 numInvalid++;
  51                 if (numInvalid > 5) {
  52                     break;
  53                 }
  54                 trailBytes = 0;
  55             }
  56
  57             // Verify that we've got the right number of trail bytes in the sequence
  58             for (;;) {
  59                 i++;
  60                 if (i>=det.fRawLength) {
  61                     break;
  62                 }
  63                 b = input[i];
  64                 if ((b & 0xc0) != 0x080) {
  65                     numInvalid++;
  66                     break;
  67                 }
  68                 if (--trailBytes == 0) {
  69                     numValid++;
  70                     break;
  71                 }
  72             }
  73
  74         }
  75
  76         // Cook up some sort of confidence score, based on presense of a BOM
  77         //    and the existence of valid and/or invalid multi-byte sequences.
  78         confidence = 0;
  79         if (hasBOM && numInvalid==0) {
  80             confidence = 100;
  81         } else if (hasBOM && numValid > numInvalid*10) {
  82             confidence = 80;
  83         } else if (numValid > 3 && numInvalid == 0) {
  84             confidence = 100;
  85         } else if (numValid > 0 && numInvalid == 0) {
  86             confidence = 80;
  87         } else if (numValid == 0 && numInvalid == 0) {
  88             // Plain ASCII.
  89             confidence = 10;
  90         } else if (numValid > numInvalid*10) {
  91             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
  92             confidence = 25;
  93         }
  94         return confidence;
  95     }
  96
  97 }