2 ***********************************************************************
4 * Copyright (C) 2005-2012, International Business Machines Corporation and
5 * others. All Rights Reserved.
7 ***********************************************************************
11 * This tool produces the character usage frequency statistics for the EUC family
12 * of charsets, for use by the ICU charset detectors.
14 * usage: java euc_tool [-d] [directory path]
16 * -d: Produce the data in a form to be exported to the ICU implementation
17 * Default is to produce an informative dump.
20 * Source directory for the files to be analyzed.
21 * Default is the current directory.
22 * There should be three subdirectories under the specified directory, one
23 * each for EUC_JP, EUC_CN and EUC_KR. Within each of these subdirectories
24 * should be text files in the specified encoding.
28 package com.ibm.icu.dev.tool.charsetdet.mbcs;
31 import java.io.FileInputStream;
32 import java.util.ArrayList;
33 import java.util.Arrays;
34 import java.util.HashMap;
35 import java.util.List;
37 public class EUCTool {
39 // The file buffer and file data length need to be out in class member variables
40 // so that the code lifted from charSet detection for scanning the multi-byte chars
41 // can see them conveniently.
42 byte [] buf = new byte[1000000];
45 boolean option_d = false; // data option. Produce exportable data
46 boolean option_v = true; // verbose informational output.
50 public static void main(String[] args) {
51 EUCTool This = new EUCTool();
57 void Main(String[] args) {
61 // Command Line Option Handling
64 for (i=0; i<args.length; i++) {
65 if (args[i].equals("-d")) {
70 if (args[i].startsWith("-")) {
71 System.err.println("Unrecongized option: " + args[i]);
78 // Verify that the specified directory exists.
80 File dir = new File(dirName);
81 if (dir.isDirectory() == false) {
82 System.err.println("\"" + dirName + "\" is not a directory");
87 // Do each subdirectory of the specified directory. There should be
88 // one for each encoding - euc-kr, euc-cn, euc-jp
90 File[] dirs = dir.listFiles();
91 for (i=0; i<dirs.length; i++) {
92 if (dirs[i].isDirectory()) {
93 String nam = dirs[i].getName();
94 if (nam.equalsIgnoreCase("CVS")) {
103 // Collect statistics from all ordinary files in a specified directory.
105 void processDir(File dir) {
106 int totalMbcsChars = 0;
107 HashMap m = new HashMap(10000);
110 System.out.println(dir.getName());
111 File[] files = dir.listFiles();
112 for (i=0; i<files.length; i++) {
113 FileInputStream is = null;
115 if (files[i].isFile()) {
116 is = new FileInputStream(files[i]);
117 fileSize = is.read(buf);
119 System.out.println(files[i].getPath());
120 System.out.println(" " + fileSize + " bytes.");
122 iteratedChar ichar = new iteratedChar();
124 int fileMbcsChars = 0;
127 while (nextChar(ichar)) {
128 if (ichar.error == true) {
133 if (ichar.charValue > 255) {
137 if (ichar.charValue <= 255) {
138 // Don't keep occurrence statistics for the single-byte range
143 // Frequency-of-occurrence statistics are accumulated in a map.
145 ChEl keyEl = new ChEl(ichar.charValue, 0);
146 ChEl valEl = (ChEl)m.get(keyEl);
154 System.out.println(" " + fileChars + " Chars");
155 System.out.println(" " + fileMbcsChars + " mbcs Chars");
156 System.out.println(" " + errs + " errors");
157 System.out.println("\n");
161 catch (Exception e) {
162 System.err.println("Exception:" + e);
169 } catch (Exception e) {
177 // We've processed through all of the files.
178 // sort and dump out the frequency statistics.
180 Object [] encounteredChars = m.values().toArray();
181 Arrays.sort(encounteredChars);
182 int cumulativeChars = 0;
183 int cumulativePercent = 0;
185 System.out.println("# <char code> <occurences> <Cumulative %>");
186 for (i=0; i<encounteredChars.length; i++) {
187 ChEl c = (ChEl)encounteredChars[i];
188 cumulativeChars += c.occurences;
189 cumulativePercent = cumulativeChars*100/totalMbcsChars;
190 System.out.println(i + " " + Integer.toHexString(c.charCode) + " "
191 + c.occurences + " " + cumulativePercent);
196 // Output the list of characters formatted for pasting into a
197 // Java source code array initializer.
198 // Re-sort into order based on the character code value, not
199 // on frequency of occurrence.
201 List charList = new ArrayList();
203 for (i=0; i<100 && cumulativePercent<50; i++) {
204 ChEl c = (ChEl)encounteredChars[i];
205 cumulativeChars += c.occurences;
206 cumulativePercent = cumulativeChars*100/totalMbcsChars;
207 charList.add(new Integer(c.charCode));
209 Object [] sortedChars = charList.toArray();
210 Arrays.sort(sortedChars);
212 System.out.print(" {");
213 for (i=0; i<sortedChars.length; i++) {
215 System.out.print(", ");
217 System.out.print("\n ");
220 int cp = ((Integer)sortedChars[i]).intValue();
221 System.out.print("0x" + Integer.toHexString(cp));
223 System.out.println("};");
228 // This is a little class containing a
229 // multi-byte character value and an occurrence count for that char.
230 // Instances of this class are kept in the collection that accumulates statistics
232 // WARNING: this class's natural ordering (from Comparable) and equals()
235 static class ChEl implements Comparable {
244 // Equals needs to work with a map, with the charCode as the key.
245 // For insertion/lookup, we care about the char code only, not the occurrence count.
246 public boolean equals(Object other) {
247 ChEl o = (ChEl)other;
248 return o.charCode == this.charCode;
251 // Hashcode needs to be compatible with equals
252 // We're using this in a hashMap!
253 public int hashCode() {
257 // We want to be able to sort the results by frequency of occurrence.
258 // Compare backwards. We want most frequent chars first.
259 public int compareTo(Object other) {
260 ChEl o = (ChEl)other;
261 return (this.occurences> o.occurences? -1 :
262 (this.occurences==o.occurences? 0 : 1));
268 // iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs
269 // Pulls out one logical char according to the rules of EUC encoding.
272 int charValue = 0; // The char value is a value from the encoding.
273 // Its meaning is not well defined, other than
274 // different encodings
277 boolean error = false;
278 boolean done = false;
289 if (nextIndex >= fileSize) {
293 int byteValue = (int)buf[nextIndex++] & 0x00ff;
299 boolean nextChar(iteratedChar it) {
300 it.index = it.nextIndex;
308 firstByte = it.charValue = it.nextByte();
310 // Ran off the end of the input data
314 if (firstByte <= 0x8d) {
319 secondByte = it.nextByte();
320 it.charValue = (it.charValue << 8) | secondByte;
322 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
324 if (secondByte < 0xa1) {
329 if (firstByte == 0x8e) {
331 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
332 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
333 // We don't know which we've got.
334 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
335 // bytes will look like a well formed 2 byte char.
336 if (secondByte < 0xa1) {
342 if (firstByte == 0x8f) {
344 // Three byte total char size, two bytes of actual char value.
345 thirdByte = it.nextByte();
346 it.charValue = (it.charValue << 8) | thirdByte;
347 if (thirdByte < 0xa1) {
354 System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte)
355 + " " + Integer.toHexString(thirdByte) + " " + Integer.toHexString(fourthByte));
357 return (it.done == false);