jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/text/CharsetRecog_Unicode.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  *\r
   7  */\r
   8 \r
   9 package com.ibm.icu.text;\r
  10 \r
  11 /**\r
  12  * This class matches UTF-16 and UTF-32, both big- and little-endian. The\r
  13  * BOM will be used if it is present.\r
  14  */\r
  15 abstract class CharsetRecog_Unicode extends CharsetRecognizer {\r
  16 \r
    /* (non-Javadoc)
     * Returns the name of the charset this recognizer tests for. The concrete
     * recognizers nested in this class return "UTF-16BE", "UTF-16LE",
     * "UTF-32BE" and "UTF-32LE".
     *
     * @see com.ibm.icu.text.CharsetRecognizer#getName()
     */
    abstract String getName();
  21 \r
    /* (non-Javadoc)
     * Examines the raw input held by the given detector and returns a
     * confidence score for this recognizer's charset. The implementations
     * nested in this class return values from 0 (no match) up to 100
     * (for example, when a matching byte order mark is found).
     *
     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
     */
    abstract int match(CharsetDetector det);
  26     \r
  27     static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode\r
  28     {\r
  29         String getName()\r
  30         {\r
  31             return "UTF-16BE";\r
  32         }\r
  33         \r
  34         int match(CharsetDetector det)\r
  35         {\r
  36             byte[] input = det.fRawInput;\r
  37             \r
  38             if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {\r
  39                 return 100;\r
  40             }\r
  41             \r
  42             // TODO: Do some statistics to check for unsigned UTF-16BE\r
  43             return 0;\r
  44         }\r
  45     }\r
  46     \r
  47     static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode\r
  48     {\r
  49         String getName()\r
  50         {\r
  51             return "UTF-16LE";\r
  52         }\r
  53         \r
  54         int match(CharsetDetector det)\r
  55         {\r
  56             byte[] input = det.fRawInput;\r
  57             \r
  58             if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE))\r
  59             {\r
  60                // An LE BOM is present.\r
  61                if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {\r
  62                    // It is probably UTF-32 LE, not UTF-16\r
  63                    return 0;\r
  64                }\r
  65                return 100;\r
  66             }        \r
  67             \r
  68             // TODO: Do some statistics to check for unsigned UTF-16LE\r
  69             return 0;\r
  70         }\r
  71     }\r
  72     \r
  73     static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode\r
  74     {\r
  75         abstract int getChar(byte[] input, int index);\r
  76         \r
  77         abstract String getName();\r
  78         \r
  79         int match(CharsetDetector det)\r
  80         {\r
  81             byte[] input   = det.fRawInput;\r
  82             int limit      = (det.fRawLength / 4) * 4;\r
  83             int numValid   = 0;\r
  84             int numInvalid = 0;\r
  85             boolean hasBOM = false;\r
  86             int confidence = 0;\r
  87             \r
  88             if (limit==0) {\r
  89                 return 0;\r
  90             }\r
  91             if (getChar(input, 0) == 0x0000FEFF) {\r
  92                 hasBOM = true;\r
  93             }\r
  94             \r
  95             for(int i = 0; i < limit; i += 4) {\r
  96                 int ch = getChar(input, i);\r
  97                 \r
  98                 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {\r
  99                     numInvalid += 1;\r
 100                 } else {\r
 101                     numValid += 1;\r
 102                 }\r
 103             }\r
 104             \r
 105             \r
 106             // Cook up some sort of confidence score, based on presence of a BOM\r
 107             //    and the existence of valid and/or invalid multi-byte sequences.\r
 108             if (hasBOM && numInvalid==0) {\r
 109                 confidence = 100;\r
 110             } else if (hasBOM && numValid > numInvalid*10) {\r
 111                 confidence = 80;\r
 112             } else if (numValid > 3 && numInvalid == 0) {\r
 113                 confidence = 100;            \r
 114             } else if (numValid > 0 && numInvalid == 0) {\r
 115                 confidence = 80;\r
 116             } else if (numValid > numInvalid*10) {\r
 117                 // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.\r
 118                 confidence = 25;\r
 119             }\r
 120             \r
 121             return confidence;\r
 122         }\r
 123     }\r
 124     \r
 125     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32\r
 126     {\r
 127         int getChar(byte[] input, int index)\r
 128         {\r
 129             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |\r
 130                    (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);\r
 131         }\r
 132         \r
 133         String getName()\r
 134         {\r
 135             return "UTF-32BE";\r
 136         }\r
 137     }\r
 138 \r
 139     \r
 140     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32\r
 141     {\r
 142         int getChar(byte[] input, int index)\r
 143         {\r
 144             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |\r
 145                    (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);\r
 146         }\r
 147         \r
 148         String getName()\r
 149         {\r
 150             return "UTF-32LE";\r
 151         }\r
 152     }\r
 153 }\r