2 ***********************************************************************
\r
4 * Copyright (C) 2006, International Business Machines Corporation and
\r
5 * others. All Rights Reserved.
\r
7 ***********************************************************************
\r
11 * This tool produces the character usage frequency statistics for the Big5
\r
12 * Chinese charset, for use by the ICU charset detectors.
\r
14 * usage: java BIG5Tool [-d] [directory path]
\r
16 * -d: Produce the data in a form to be exported to the ICU implementation
\r
17 * Default is to produce an informative dump.
\r
19 * -sjis Do Shift_JIS. The structure of sjis is very similar to Big5.
\r
22 * Source directory for the text files to be analyzed.
\r
23 * All files in the specified directory must be in the Big5 encoding.
\r
27 package com.ibm.icu.dev.tool.charsetdet.mbcs;
\r
29 import java.io.File;
\r
30 import java.io.FileInputStream;
\r
31 import java.util.ArrayList;
\r
32 import java.util.Arrays;
\r
33 import java.util.HashMap;
\r
34 import java.util.List;
\r
37 public class BIG5Tool {
\r
39 // The file buffer and file data length need to be out in class member variables
\r
40 // so that the code lifted from charSet detection for scanning the multi-byte chars
\r
41 // can see them conveniently.
\r
42 byte [] buf = new byte[1000000];
\r
45 boolean option_d = false; // data option. Produce exportable data
\r
46 boolean option_v = true; // verbose informaional output.
\r
47 boolean sjis = false; // True if input text files are Shift_JIS encoded.
\r
51 public static void main(String[] args) {
\r
52 BIG5Tool This = new BIG5Tool();
\r
58 void Main(String[] args) {
\r
62 // Command Line Option Handling
\r
64 String dirName = null;
\r
65 for (i=0; i<args.length; i++) {
\r
66 if (args[i].equals("-d")) {
\r
71 if (args[i].equals("-sjis")) {
\r
75 if (args[i].startsWith("-")) {
\r
76 System.err.println("Unrecognized option: " + args[i]);
\r
79 if (dirName == null) {
\r
82 System.err.println("Unrecognized option: " + dirName);
\r
86 if (dirName == null) {
\r
91 // Verify that the specified directory exists.
\r
93 File dir = new File(dirName);
\r
94 if (dir.isDirectory() == false) {
\r
95 System.err.println("\"" + dirName + "\" is not a directory");
\r
103 // Collect statistics from all ordinary files in a specified directory.
\r
105 void processDir(File dir) {
\r
106 int totalMbcsChars = 0;
\r
107 HashMap m = new HashMap(10000);
\r
110 System.out.println(dir.getName());
\r
111 File[] files = dir.listFiles();
\r
112 for (i=0; i<files.length; i++) {
\r
114 if (files[i].isFile()) {
\r
115 FileInputStream is = new FileInputStream(files[i]);
\r
116 fileSize = is.read(buf);
\r
118 System.out.println(files[i].getPath());
\r
119 System.out.println(" " + fileSize + " bytes.");
\r
121 iteratedChar ichar = new iteratedChar();
\r
123 int fileMbcsChars = 0;
\r
126 while (nextChar(ichar)) {
\r
127 if (ichar.error == true) {
\r
132 if (ichar.charValue > 255) {
\r
136 if (ichar.charValue <= 255) {
\r
137 // Don't keep occurrence statistics for the single byte range
\r
142 // Frequency-of-occurrence statistics are accumulated in a map.
\r
144 ChEl keyEl = new ChEl(ichar.charValue, 0);
\r
145 ChEl valEl = (ChEl)m.get(keyEl);
\r
146 if (valEl == null) {
\r
147 m.put(keyEl, keyEl);
\r
150 valEl.occurences++;
\r
153 System.out.println(" " + fileChars + " Chars");
\r
154 System.out.println(" " + fileMbcsChars + " mbcs Chars");
\r
155 System.out.println(" " + errs + " errors");
\r
156 System.out.println("\n");
\r
160 catch (Exception e) {
\r
161 System.err.println("Exception:" + e);
\r
167 // We've processed through all of the files.
\r
168 // sort and dump out the frequency statistics.
\r
170 Object [] encounteredChars = m.values().toArray();
\r
171 Arrays.sort(encounteredChars);
\r
172 int cumulativeChars = 0;
\r
173 int cumulativePercent = 0;
\r
175 System.out.println("# <char code> <occurences> <Cumulative %>");
\r
176 for (i=0; i<encounteredChars.length; i++) {
\r
177 ChEl c = (ChEl)encounteredChars[i];
\r
178 cumulativeChars += c.occurences;
\r
179 cumulativePercent = cumulativeChars*100/totalMbcsChars;
\r
180 System.out.println(i + " " + Integer.toHexString(c.charCode) + " "
\r
181 + c.occurences + " " + cumulativePercent);
\r
186 // Output the list of characters formatted for pasting into a
\r
187 // Java source code array initializer.
\r
188 // Resort into order based on the character code value, not
\r
189 // on frequency of occurrence.
\r
191 List charList = new ArrayList();
\r
193 for (i=0; i<100 && cumulativePercent<50; i++) {
\r
194 ChEl c = (ChEl)encounteredChars[i];
\r
195 cumulativeChars += c.occurences;
\r
196 cumulativePercent = cumulativeChars*100/totalMbcsChars;
\r
197 charList.add(new Integer(c.charCode));
\r
199 Object [] sortedChars = charList.toArray();
\r
200 Arrays.sort(sortedChars);
\r
202 System.out.print(" {");
\r
203 for (i=0; i<sortedChars.length; i++) {
\r
205 System.out.print(", ");
\r
207 System.out.print("\n ");
\r
210 int cp = ((Integer)sortedChars[i]).intValue();
\r
211 System.out.print("0x" + Integer.toHexString(cp));
\r
213 System.out.println("};");
\r
218 // This is a little class containing a
\r
219 // multi-byte character value and an occurrence count for that char.
\r
220 // Instances of this class are kept in the collection that accumulates statistics
\r
222 // WARNING: this class's natural ordering (from Comparable) and equals()
\r
223 // are inconsistent.
\r
225 static class ChEl implements Comparable {
\r
229 ChEl(int c, int o) {
\r
234 // Equals needs to work with a map, with the charCode as the key.
\r
235 // For insertion/lookup, we care about the char code only, not the occurrence count.
\r
236 public boolean equals(Object other) {
\r
237 ChEl o = (ChEl)other;
\r
238 return o.charCode == this.charCode;
\r
241 // Hashcode needs to be compatible with equals
\r
242 // We're using this in a hashMap!
\r
243 public int hashCode() {
\r
247 // We want to be able to sort the results by frequency of occurrence
\r
248 // Compare backwards. We want most frequent chars first.
\r
249 public int compareTo(Object other) {
\r
250 ChEl o = (ChEl)other;
\r
251 return (this.occurences> o.occurences? -1 :
\r
252 (this.occurences==o.occurences? 0 : 1));
\r
258 // iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs
\r
259 // Pulls out one logical char according to the rules of the Big5 (or, with -sjis, Shift_JIS) encoding.
\r
261 class iteratedChar {
\r
262 int charValue = 0; // The char value is a value from the encoding.
\r
263 // Its meaning is not well defined, other than
\r
264 // different encodings
\r
267 boolean error = false;
\r
268 boolean done = false;
\r
279 if (nextIndex >= fileSize) {
\r
283 int byteValue = (int)buf[nextIndex++] & 0x00ff;
\r
289 boolean nextChar(iteratedChar it) {
\r
290 it.index = it.nextIndex;
\r
293 int secondByte = 0;
\r
296 firstByte = it.charValue = it.nextByte();
\r
297 if (firstByte < 0) {
\r
298 // Ran off the end of the input data
\r
302 if (firstByte <= 0x0080 ||
\r
303 (sjis && firstByte>=0x00a0 && firstByte< 0x00e0) ||
\r
304 (sjis && firstByte>=0x00fd && firstByte<=0x00ff)) {
\r
305 // single byte char
\r
309 secondByte = it.nextByte();
\r
310 it.charValue = (it.charValue << 8) | secondByte;
\r
312 if (secondByte < 0x40 ||
\r
313 secondByte == 0x007f ||
\r
314 secondByte == 0x00ff ||
\r
315 sjis && secondByte >= 0x00fd) {
\r
320 System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
\r
324 return (it.done == false);
\r