2 ***********************************************************************
4 * Copyright (C) 2006, International Business Machines Corporation and
5 * others. All Rights Reserved.
7 ***********************************************************************
11 * This tool produces the character usage frequency statistics for the Big5
12 * Chinese charset, for use by the ICU charset detectors.
14 * usage: java BIG5Tool [-d] [-sjis] [directory path]
16 * -d: Produce the data in a form to be exported to the ICU implementation
17 * Default is to produce an informative dump.
19 * -sjis Do Shift_JIS. The structure of sjis is very similar to Big5.
22 * Source directory for the text files to be analyzed.
23 * All files in the specified directory must be in the Big5 encoding
23 * (or in Shift_JIS when the -sjis option is given).
27 package com.ibm.icu.dev.tool.charsetdet.mbcs;
30 import java.io.FileInputStream;
31 import java.util.ArrayList;
32 import java.util.Arrays;
33 import java.util.HashMap;
34 import java.util.List;
37 public class BIG5Tool {
39 // The file buffer and file data length need to be out in class member variables
40 // so that the code lifted from charSet detection for scanning the multi-byte chars
41 // can see them conveniently.
// NOTE(review): buf is a fixed 1,000,000-byte array and processDir fills it with a
// single read() call per file, so a larger file (or a short read) would presumably
// be analyzed only in part -- confirm this is acceptable for the input corpus.
42 byte [] buf = new byte[1000000];
45 boolean option_d = false; // data option. Produce exportable data
46 boolean option_v = true; // verbose informational output.
47 boolean sjis = false; // True if input text files are Shift_JIS encoded.
// Command-line entry point.
// A BIG5Tool instance is created because the file buffer and the option flags are
// instance (non-static) members.
51 public static void main(String[] args) {
52 BIG5Tool This = new BIG5Tool();
// Instance-level main: interprets the command line arguments (-d, -sjis and a
// directory name) and checks that the named path is a directory.
58 void Main(String[] args) {
62 // Command Line Option Handling
64 String dirName = null;
65 for (i=0; i<args.length; i++) {
66 if (args[i].equals("-d")) {
71 if (args[i].equals("-sjis")) {
// Any other argument that begins with '-' is reported as unrecognized.
75 if (args[i].startsWith("-")) {
76 System.err.println("Unrecognized option: " + args[i]);
// Presumably the first non-option argument is taken as the directory name -- confirm.
79 if (dirName == null) {
// NOTE(review): this message prints dirName, not args[i]. If it is meant to report
// an extra non-option argument, it would show the directory already accepted
// rather than the offending argument -- confirm which value is intended.
82 System.err.println("Unrecognized option: " + dirName);
// Reached after the argument loop: no directory name was supplied.
86 if (dirName == null) {
91 // Verify that the specified directory exists.
93 File dir = new File(dirName);
94 if (dir.isDirectory() == false) {
95 System.err.println("\"" + dirName + "\" is not a directory");
103 // Collect statistics from all ordinary files in a specified directory.
//
// Prints the directory name, then for each regular file: reads it into buf, walks
// the bytes with nextChar(), and accumulates per-character occurrence counts in a
// map. After all files are done, prints the characters sorted by frequency with a
// running cumulative percentage, followed by a list of the most frequent character
// codes sorted by code value and formatted as a brace-delimited list of hex literals.
//
// dir: the directory whose files are to be scanned.
105 void processDir(File dir) {
106 int totalMbcsChars = 0;
// Raw-typed map; lookups below use a ChEl as the key and cast the value to ChEl.
107 HashMap m = new HashMap(10000);
110 System.out.println(dir.getName());
// NOTE(review): listFiles() can return null (e.g. on an I/O error); no null check
// is visible before the loop -- confirm.
111 File[] files = dir.listFiles();
112 for (i=0; i<files.length; i++) {
114 if (files[i].isFile()) {
// NOTE(review): no close() of this stream is visible here -- confirm it is released.
115 FileInputStream is = new FileInputStream(files[i]);
// A single read() call: fileSize is the number of bytes actually placed in buf
// (or -1 at end of stream), which may be less than the file length.
116 fileSize = is.read(buf);
118 System.out.println(files[i].getPath());
119 System.out.println(" " + fileSize + " bytes.");
121 iteratedChar ichar = new iteratedChar();
123 int fileMbcsChars = 0;
// Step through the buffer one logical character at a time.
126 while (nextChar(ichar)) {
127 if (ichar.error == true) {
// Values above 255 are two-byte characters (first byte << 8 | second byte).
132 if (ichar.charValue > 255) {
136 if (ichar.charValue <= 255) {
137 // Don't keep occurrence statistics for the single byte range
142 // Frequency of occurrence statistics are accumulated in a map.
// The probe key carries a zero count; ChEl.equals() compares the char code only.
144 ChEl keyEl = new ChEl(ichar.charValue, 0);
145 ChEl valEl = (ChEl)m.get(keyEl);
// Per-file summary.
153 System.out.println(" " + fileChars + " Chars");
154 System.out.println(" " + fileMbcsChars + " mbcs Chars");
155 System.out.println(" " + errs + " errors");
156 System.out.println("\n");
// Any exception raised while handling a file is reported on stderr.
160 catch (Exception e) {
161 System.err.println("Exception:" + e);
167 // We've processed through all of the files.
168 // sort and dump out the frequency statistics.
// Arrays.sort uses ChEl.compareTo(), which orders the most frequent chars first.
170 Object [] encounteredChars = m.values().toArray();
171 Arrays.sort(encounteredChars);
172 int cumulativeChars = 0;
173 int cumulativePercent = 0;
175 System.out.println("# <char code> <occurences> <Cumulative %>");
176 for (i=0; i<encounteredChars.length; i++) {
177 ChEl c = (ChEl)encounteredChars[i];
178 cumulativeChars += c.occurences;
// NOTE(review): integer division by totalMbcsChars; if that is zero while the map
// is non-empty this throws ArithmeticException -- confirm the two always agree.
// Also cumulativeChars*100 is int arithmetic and could overflow on a very large
// corpus -- confirm expected input sizes.
179 cumulativePercent = cumulativeChars*100/totalMbcsChars;
180 System.out.println(i + " " + Integer.toHexString(c.charCode) + " "
181 + c.occurences + " " + cumulativePercent);
186 // Output the list of characters formatted for pasting into a
187 // Java source code array initializer.
188 // Resort into order based on the character code value, not
189 // on frequency of occurrence.
191 List charList = new ArrayList();
// Take the most frequent characters until either 100 have been collected or the
// cumulative percentage reaches 50.
// NOTE(review): the loop condition does not bound i by encounteredChars.length, and
// it relies on cumulativeChars / cumulativePercent having been reset after the
// dump loop above -- confirm both.
193 for (i=0; i<100 && cumulativePercent<50; i++) {
194 ChEl c = (ChEl)encounteredChars[i];
195 cumulativeChars += c.occurences;
196 cumulativePercent = cumulativeChars*100/totalMbcsChars;
197 charList.add(new Integer(c.charCode));
// Integers sort by numeric value, giving ascending character-code order.
199 Object [] sortedChars = charList.toArray();
200 Arrays.sort(sortedChars);
202 System.out.print(" {");
203 for (i=0; i<sortedChars.length; i++) {
205 System.out.print(", ");
207 System.out.print("\n ");
210 int cp = ((Integer)sortedChars[i]).intValue();
211 System.out.print("0x" + Integer.toHexString(cp));
213 System.out.println("};");
218 // This is a little class containing a
219 // multi-byte character value and an occurrence count for that char.
220 // Instances of this class are kept in the collection that accumulates statistics
222 // WARNING: this class's natural ordering (from Comparable) and equals()
225 static class ChEl implements Comparable {
234 // Equals needs to work with a map, with the charCode as the key.
235 // For insertion/lookup, we care about the char code only, not the occurrence count.
// The argument is cast without an instanceof check, so passing a non-ChEl object
// throws ClassCastException and passing null throws NullPointerException.
236 public boolean equals(Object other) {
237 ChEl o = (ChEl)other;
238 return o.charCode == this.charCode;
241 // Hashcode needs to be compatible with equals
242 // We're using this in a hashMap!
243 public int hashCode() {
247 // We want to be able to sort the results by frequency of occurrence
248 // Compare backwards. We want most frequent chars first.
// Returns -1 when this element has more occurrences than the other, 0 when the
// counts are equal, and 1 otherwise. The ordering looks only at the count while
// equals() looks only at the char code, so compareTo() == 0 does not imply equals().
249 public int compareTo(Object other) {
250 ChEl o = (ChEl)other;
251 return (this.occurences> o.occurences? -1 :
252 (this.occurences==o.occurences? 0 : 1));
258 // iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs
259 // Pulls out one logical char according to the rules of EUC encoding.
// NOTE(review): the scanning code in nextChar() tests Big5 / Shift_JIS byte ranges;
// the mention of EUC above looks like a leftover from the code this was copied
// from -- confirm and reword.
262 int charValue = 0; // The char value is a value from the encoding.
263 // Its meaning is not well defined, other than
264 // different encodings
// Status flags; nextChar() returns false once done has been set.
267 boolean error = false;
268 boolean done = false;
// Stop at the end of the data that was read into buf (fileSize bytes).
279 if (nextIndex >= fileSize) {
// Mask to 0..255 so that bytes >= 0x80 are not seen as negative values.
283 int byteValue = (int)buf[nextIndex++] & 0x00ff;
// Advances the iterator to the next logical character in buf.
//
// it: iteration state; index, nextIndex, charValue (and, presumably, error / done)
//     are updated in place.
// Returns true while it.done is still false, i.e. while a character was produced.
289 boolean nextChar(iteratedChar it) {
// Remember where this character starts.
290 it.index = it.nextIndex;
296 firstByte = it.charValue = it.nextByte();
298 // Ran off the end of the input data
// First-byte values in these ranges are presumably complete single-byte characters
// (the two extra ranges apply only in Shift_JIS mode) -- confirm against the
// branch body.
302 if (firstByte <= 0x0080 ||
303 (sjis && firstByte>=0x00a0 && firstByte< 0x00e0) ||
304 (sjis && firstByte>=0x00fd && firstByte<=0x00ff)) {
// Two-byte character: the value becomes (first byte << 8) | second byte.
309 secondByte = it.nextByte();
310 it.charValue = (it.charValue << 8) | secondByte;
// Second-byte values in these ranges are presumably treated as illegal trail
// bytes and reported by the "Error" message below -- confirm against the branch body.
312 if (secondByte < 0x40 ||
313 secondByte == 0x007f ||
314 secondByte == 0x00ff ||
315 sjis && secondByte >= 0x00fd) {
320 System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
324 return (it.done == false);