gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_8_1_1/main/classes/core/src/com/ibm/icu/text/CharsetRecog_UTF8.java
Added flags.
[Dictionary.git] / jars / icu4j-4_8_1_1 / main / classes / core / src / com / ibm / icu / text / CharsetRecog_UTF8.java
1 /**
2 *******************************************************************************
3 * Copyright (C) 2005 - 2010, International Business Machines Corporation and  *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7 package com.ibm.icu.text;
8
9 /**
10  * Charset recognizer for UTF-8
11  */
12 class CharsetRecog_UTF8 extends CharsetRecognizer {
13
14     String getName() {
15         return "UTF-8";
16     }
17
18     /* (non-Javadoc)
19      * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
20      */
21     int match(CharsetDetector det) {
22         boolean     hasBOM = false;
23         int         numValid = 0;
24         int         numInvalid = 0;
25         byte        input[] = det.fRawInput;
26         int         i;
27         int         trailBytes = 0;
28         int         confidence;
29         
30         if (det.fRawLength >= 3 && 
31                 (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb & (input[2] & 0xFF) == 0xbf) {
32             hasBOM = true;
33         }
34         
35         // Scan for multi-byte sequences
36         for (i=0; i<det.fRawLength; i++) {
37             int b = input[i];
38             if ((b & 0x80) == 0) {
39                 continue;   // ASCII
40             }
41             
42             // Hi bit on char found.  Figure out how long the sequence should be
43             if ((b & 0x0e0) == 0x0c0) {
44                 trailBytes = 1;                
45             } else if ((b & 0x0f0) == 0x0e0) {
46                 trailBytes = 2;
47             } else if ((b & 0x0f8) == 0xf0) {
48                 trailBytes = 3;
49             } else {
50                 numInvalid++;
51                 if (numInvalid > 5) {
52                     break;
53                 }
54                 trailBytes = 0;
55             }
56                 
57             // Verify that we've got the right number of trail bytes in the sequence
58             for (;;) {
59                 i++;
60                 if (i>=det.fRawLength) {
61                     break;
62                 }
63                 b = input[i];
64                 if ((b & 0xc0) != 0x080) {
65                     numInvalid++;
66                     break;
67                 }
68                 if (--trailBytes == 0) {
69                     numValid++;
70                     break;
71                 }
72             }
73                         
74         }
75         
76         // Cook up some sort of confidence score, based on presense of a BOM
77         //    and the existence of valid and/or invalid multi-byte sequences.
78         confidence = 0;
79         if (hasBOM && numInvalid==0) {
80             confidence = 100;
81         } else if (hasBOM && numValid > numInvalid*10) {
82             confidence = 80;
83         } else if (numValid > 3 && numInvalid == 0) {
84             confidence = 100;            
85         } else if (numValid > 0 && numInvalid == 0) {
86             confidence = 80;
87         } else if (numValid == 0 && numInvalid == 0) {
88             // Plain ASCII.  
89             confidence = 10;            
90         } else if (numValid > numInvalid*10) {
91             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
92             confidence = 25;
93         }
94         return confidence;
95     }
96
97 }