jars/icu4j-52_1/tools/misc/src/com/ibm/icu/dev/tool/charsetdet/mbcs/EUCTool.java

   1 /*
   2  ***********************************************************************
   3  *
   4  * Copyright (C) 2005-2012, International Business Machines Corporation and
   5  * others. All Rights Reserved.
   6  *
   7  ***********************************************************************
   8  *
   9  * euc_tool
  10  *
  11  *    This tool produces the character usage frequency statistics for the EUC family
  12  *    of charsets, for use by the ICU charset detectors.
  13  *
  14  *    usage:  java euc_tool [-d] [directory path]
  15  *
  16  *        -d:   Produce the data in a form to be exported to the ICU implementation
  17  *              Default is to produce an informative dump.
  18  *
  19  *        directory path
  20  *              Source directory for the files to be analyzed.
  21  *              Default is the current directory.
  22  *              There should be three subdirectories under the specified directory, one
  23  *              each for EUC_JP, EUC_CN and EUC_KR.  Within each of these subdirectories
  24  *              should be text files in the specified encoding.
  25  *
  26  */
  27
  28 package com.ibm.icu.dev.tool.charsetdet.mbcs;
  29
  30 import java.io.File;
  31 import java.io.FileInputStream;
  32 import java.util.ArrayList;
  33 import java.util.Arrays;
  34 import java.util.HashMap;
  35 import java.util.List;
  36
  37 public class EUCTool {
  38
  39     // The file buffer and file data length need to be out in class member variables
  40     //  so that the code lifted from charSet detection for scanning the multi-byte chars
  41     //  can see them conveniently.
  42     byte []    buf = new byte[1000000];
  43     int        fileSize;
  44
  45     boolean    option_d = false;    // data option.  Produce exportable data
  46     boolean    option_v = true;     // verbose informaional output.
  47
  48
  49
  50     public static void main(String[] args) {
  51         EUCTool  This = new EUCTool();
  52         This.Main(args);
  53     }
  54
  55
  56
  57     void Main(String[] args) {
  58         int i;
  59
  60         //
  61         //   Command Line Option Handling
  62         //
  63         String     dirName  = ".";
  64         for (i=0; i<args.length; i++) {
  65             if (args[i].equals("-d")) {
  66                 option_d = true;
  67                 option_v = false;
  68                 continue;
  69             }
  70             if (args[i].startsWith("-")) {
  71                 System.err.println("Unrecongized option: " + args[i]);
  72                 System.exit(-1);
  73             }
  74             dirName = args[i];
  75         }
  76
  77         //
  78         //  Verify that the specified directory exists.
  79         //
  80         File dir = new File(dirName);
  81         if (dir.isDirectory() == false) {
  82             System.err.println("\"" + dirName + "\" is not a directory");
  83             System.exit(-1);
  84         }
  85
  86         //
  87         //  Do each subdirectory of the specified directory.  There should be
  88         //    one per each encoding - euc-kr, euc-cn, euc-jp
  89         //
  90         File[] dirs  = dir.listFiles();
  91         for (i=0; i<dirs.length; i++) {
  92             if (dirs[i].isDirectory()) {
  93                 String nam = dirs[i].getName();
  94                 if (nam.equalsIgnoreCase("CVS")) {
  95                     continue;
  96                 }
  97                 processDir(dirs[i]);
  98             }
  99         }
 100     }
 101
 102     //
 103     // Collect statistics from all ordinary files in a specified directory.
 104     //
 105     void processDir(File dir) {
 106         int      totalMbcsChars  = 0;
 107         HashMap  m = new HashMap(10000);
 108         int      i;
 109
 110         System.out.println(dir.getName());
 111         File[] files = dir.listFiles();
 112         for (i=0; i<files.length; i++) {
 113             FileInputStream is = null;
 114             try {
 115                 if (files[i].isFile()) {
 116                     is = new FileInputStream(files[i]);
 117                     fileSize = is.read(buf);
 118                     if (option_v) {
 119                         System.out.println(files[i].getPath());
 120                         System.out.println("  " + fileSize + " bytes.");
 121                     }
 122                     iteratedChar ichar = new iteratedChar();
 123                     int fileChars     = 0;
 124                     int fileMbcsChars = 0;
 125                     int errs          = 0;
 126
 127                     while (nextChar(ichar)) {
 128                         if (ichar.error == true) {
 129                             errs++;
 130                             continue;
 131                         }
 132                         fileChars++;
 133                         if (ichar.charValue > 255) {
 134                             fileMbcsChars++;
 135                             totalMbcsChars++;
 136                         }
 137                         if (ichar.charValue <= 255) {
 138                             // Don't keep occurence statistics for the single byte range
 139                             continue;
 140                         }
 141
 142                         //
 143                         //  Frequency of occurence statistics are accumulated in a map.
 144                         //
 145                         ChEl  keyEl = new ChEl(ichar.charValue, 0);
 146                         ChEl  valEl = (ChEl)m.get(keyEl);
 147                         if (valEl == null) {
 148                             m.put(keyEl, keyEl);
 149                             valEl = keyEl;
 150                         }
 151                         valEl.occurences++;
 152                     }
 153                     if (option_v) {
 154                         System.out.println("  " + fileChars     + " Chars");
 155                         System.out.println("  " + fileMbcsChars + " mbcs Chars");
 156                         System.out.println("  " + errs          + " errors");
 157                         System.out.println("\n");
 158                     }
 159                 }
 160             }
 161             catch (Exception e) {
 162                 System.err.println("Exception:" + e);
 163
 164             }
 165             finally {
 166                 if (is != null) {
 167                     try {
 168                         is.close();
 169                     } catch (Exception e) {
 170                         // ignore
 171                     }
 172                 }
 173             }
 174         }
 175
 176         //
 177         //  We've processed through all of the files.
 178         //     sort and dump out the frequency statistics.
 179         //
 180         Object [] encounteredChars = m.values().toArray();
 181         Arrays.sort(encounteredChars);
 182         int cumulativeChars = 0;
 183         int cumulativePercent = 0;
 184         if (option_v) {
 185             System.out.println("# <char code> <occurences>  <Cumulative %>");
 186             for (i=0; i<encounteredChars.length; i++) {
 187                 ChEl c = (ChEl)encounteredChars[i];
 188                 cumulativeChars += c.occurences;
 189                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
 190                 System.out.println(i + "   " + Integer.toHexString(c.charCode) + "        "
 191                         + c.occurences + "         " + cumulativePercent);
 192             }
 193         }
 194         if (option_d) {
 195             //
 196             //   Output the list of characters formatted for pasting into a
 197             //     Java source code array initializer.
 198             //     Resort into order based on the character code value, not
 199             //      on frequency of occurence.
 200             //
 201             List  charList = new ArrayList();
 202
 203             for (i=0; i<100 && cumulativePercent<50; i++) {
 204                 ChEl c = (ChEl)encounteredChars[i];
 205                 cumulativeChars += c.occurences;
 206                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
 207                 charList.add(new Integer(c.charCode));
 208             }
 209             Object [] sortedChars = charList.toArray();
 210             Arrays.sort(sortedChars);
 211
 212             System.out.print("          {");
 213             for (i=0; i<sortedChars.length; i++) {
 214                 if (i != 0) {
 215                     System.out.print(", ");
 216                     if ((i)%10 == 0) {
 217                         System.out.print("\n           ");
 218                     }
 219                 }
 220                 int cp = ((Integer)sortedChars[i]).intValue();
 221                 System.out.print("0x" + Integer.toHexString(cp));
 222             }
 223             System.out.println("};");
 224         }
 225     }
 226
 227     //
 228     //  This is a little class containing a
 229     //    multi-byte character value and an occurence count for that char.
 230     //  Instances of this class are kept in the collection that accumulates statistics
 231     //
 232     //  WARNING:  this class's natural ordering (from Comparable) and equals()
 233     //            are inconsistent.
 234
 235     static class ChEl implements Comparable {
 236         int charCode;
 237         int occurences;
 238
 239         ChEl(int c, int o) {
 240             charCode = c;
 241             occurences = o;
 242         }
 243
 244         // Equals needs to work with a map, with the charCode as the key.
 245         //   For insertion/lookup, we care about the char code only, not the occurence count.
 246         public boolean equals(Object other) {
 247             ChEl o = (ChEl)other;
 248             return o.charCode == this.charCode;
 249         }
 250
 251         // Hashcode needs to be compatible with equals
 252         //   We're using this in a hashMap!
 253         public int hashCode() {
 254             return charCode;
 255         }
 256
 257         // We want to be able to sort the results by frequency of occurence
 258         //   Compare backwards.  We want most frequent chars first.
 259         public int compareTo(Object other) {
 260             ChEl o = (ChEl)other;
 261             return (this.occurences> o.occurences? -1 :
 262                    (this.occurences==o.occurences?  0 : 1));
 263         }
 264
 265     }
 266
 267     //
 268     // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
 269     //              Pulls out one logical char according to the rules of EUC encoding.
 270     //
 271     class iteratedChar {
 272         int             charValue = 0;             // The char value is a value from the encoding.
 273                                                    //   It's meaning is not well defined, other than
 274                                                    //   different encodings
 275         int             index     = 0;
 276         int             nextIndex = 0;
 277         boolean         error     = false;
 278         boolean         done      = false;
 279
 280         void reset() {
 281             charValue = 0;
 282             index     = -1;
 283             nextIndex = 0;
 284             error     = false;
 285             done      = false;
 286         }
 287
 288         int nextByte() {
 289             if (nextIndex >= fileSize) {
 290                 done = true;
 291                 return -1;
 292             }
 293             int byteValue = (int)buf[nextIndex++] & 0x00ff;
 294             return byteValue;
 295         }
 296     }
 297
 298
 299     boolean nextChar(iteratedChar it) {
 300         it.index = it.nextIndex;
 301         it.error = false;
 302         int firstByte  = 0;
 303         int secondByte = 0;
 304         int thirdByte  = 0;
 305         int fourthByte = 0;
 306
 307         buildChar: {
 308             firstByte = it.charValue = it.nextByte();
 309             if (firstByte < 0) {
 310                 // Ran off the end of the input data
 311                 it.done = true;
 312                 break buildChar;
 313             }
 314             if (firstByte <= 0x8d) {
 315                 // single byte char
 316                 break buildChar;
 317             }
 318
 319             secondByte = it.nextByte();
 320             it.charValue = (it.charValue << 8) | secondByte;
 321
 322             if (firstByte >= 0xA1 && firstByte <= 0xfe) {
 323                 // Two byte Char
 324                 if (secondByte < 0xa1) {
 325                     it.error = true;
 326                 }
 327                 break buildChar;
 328             }
 329             if (firstByte == 0x8e) {
 330                 // Code Set 2.
 331                 //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 332                 //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 333                 // We don't know which we've got.
 334                 // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 335                 //   bytes will look like a well formed 2 byte char.
 336                 if (secondByte < 0xa1) {
 337                     it.error = true;
 338                 }
 339                 break buildChar;
 340             }
 341
 342             if (firstByte == 0x8f) {
 343                 // Code set 3.
 344                 // Three byte total char size, two bytes of actual char value.
 345                 thirdByte    = it.nextByte();
 346                 it.charValue = (it.charValue << 8) | thirdByte;
 347                 if (thirdByte < 0xa1) {
 348                     it.error = true;
 349                 }
 350             }
 351
 352         }
 353         if (it.error) {
 354             System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte)
 355                     + " " +  Integer.toHexString(thirdByte) + " " + Integer.toHexString(fourthByte));
 356         }
 357         return (it.done == false);
 358     }
 359 }
 360
 361
 362
 363