2 ***********************************************************************
\r
3 * Copyright (C) 2005-2010, International Business Machines *
\r
4 * Corporation and others. All Rights Reserved. *
\r
5 ***********************************************************************
\r
9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
\r
12 import java.io.FileOutputStream;
\r
13 import java.io.IOException;
\r
14 import java.io.PrintStream;
\r
15 import java.util.ArrayList;
\r
16 import java.util.Arrays;
\r
17 import java.util.Collections;
\r
18 import java.util.Iterator;
\r
19 import java.util.List;
\r
21 import com.ibm.icu.impl.Utility;
\r
26 * TODO To change the template for this generated type comment go to
\r
27 * Window - Preferences - Java - Code Style - Code Templates
\r
29 public class StatisticsTool implements NGramParser.NGramParserClient, NGramList.NGramKeyMapper
\r
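31 /* TODO Make this usage string more sane: fix the typo "needed to detected" (should be "needed to detect"), add the missing "\n" at the end of the -l line, and use the real package name (com.ibm.icu.dev.tool.charsetdet.sbcs) in the example. */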
\r
32 private static final String usageString =
\r
33 "\nUsage: StatisticsTool [OPTIONS] [FILES]\n\n" +
\r
34 "This program will read in a Unicode text file of text in a particular language\n" +
\r
35 "and compute the statistics needed to detected that language and character set.\n " +
\r
37 "-e specify the target encoding\n" +
\r
38 "-h or -? print this usage text.\n" +
\r
39 "-v also generate statistics for visual order.\n" +
\r
40 "-l only generate statistics for logical order (cancel -v)." +
\r
41 "-c run the checker.\n" +
\r
42 "-t run the encoding test.\n" +
\r
43 "example: com.ibm.icu.dev.tool.charset.StatisticsTool -e 8859-1 Spanish.txt";
\r
45 private static final int BUFFER_SIZE = 1024;
\r
47 private char[] buffer;
\r
48 private int bufIndex;
\r
51 private InputFile inputFile;
\r
53 private NGramList ngrams;
\r
55 private static byte[] allBytes = {
\r
56 (byte) 0x00, (byte) 0x01, (byte) 0x02, (byte) 0x03, (byte) 0x04, (byte) 0x05, (byte) 0x06, (byte) 0x07,
\r
57 (byte) 0x08, (byte) 0x09, (byte) 0x0A, (byte) 0x0B, (byte) 0x0C, (byte) 0x0D, (byte) 0x0E, (byte) 0x0F,
\r
58 (byte) 0x10, (byte) 0x11, (byte) 0x12, (byte) 0x13, (byte) 0x14, (byte) 0x15, (byte) 0x16, (byte) 0x17,
\r
59 (byte) 0x18, (byte) 0x19, (byte) 0x1A, (byte) 0x1B, (byte) 0x1C, (byte) 0x1D, (byte) 0x1E, (byte) 0x1F,
\r
60 (byte) 0x20, (byte) 0x21, (byte) 0x22, (byte) 0x23, (byte) 0x24, (byte) 0x25, (byte) 0x26, (byte) 0x27,
\r
61 (byte) 0x28, (byte) 0x29, (byte) 0x2A, (byte) 0x2B, (byte) 0x2C, (byte) 0x2D, (byte) 0x2E, (byte) 0x2F,
\r
62 (byte) 0x30, (byte) 0x31, (byte) 0x32, (byte) 0x33, (byte) 0x34, (byte) 0x35, (byte) 0x36, (byte) 0x37,
\r
63 (byte) 0x38, (byte) 0x39, (byte) 0x3A, (byte) 0x3B, (byte) 0x3C, (byte) 0x3D, (byte) 0x3E, (byte) 0x3F,
\r
64 (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47,
\r
65 (byte) 0x48, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F,
\r
66 (byte) 0x50, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57,
\r
67 (byte) 0x58, (byte) 0x59, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F,
\r
68 (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67,
\r
69 (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F,
\r
70 (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77,
\r
71 (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F,
\r
72 (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87,
\r
73 (byte) 0x88, (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F,
\r
74 (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97,
\r
75 (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F,
\r
76 (byte) 0xA0, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7,
\r
77 (byte) 0xA8, (byte) 0xA9, (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD, (byte) 0xAE, (byte) 0xAF,
\r
78 (byte) 0xB0, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7,
\r
79 (byte) 0xB8, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD, (byte) 0xBE, (byte) 0xBF,
\r
80 (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7,
\r
81 (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF,
\r
82 (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7,
\r
83 (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF,
\r
84 (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7,
\r
85 (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF,
\r
86 (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7,
\r
87 (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF
\r
93 public StatisticsTool()
\r
95 buffer = new char[BUFFER_SIZE];
\r
102 private static void usage()
\r
104 System.out.println(usageString);
\r
107 // private static void exceptionError(Exception e)
\r
109 // System.err.println("ioError: " + e.toString());
\r
112 private int nextBuffer(InputFile inputFileArg)
\r
116 return inputFileArg.read(buffer);
\r
119 public char nextChar()
\r
121 if (bufIndex >= bufMax) {
\r
122 bufMax = nextBuffer(inputFile);
\r
129 return buffer[bufIndex++];
\r
132 public void handleNGram(String key)
\r
137 public Object mapKey(String key)
\r
142 private NGramList dumpNGrams()
\r
144 String filename = inputFile.getPath();
\r
145 int extension = filename.lastIndexOf(".");
\r
146 String outputFileName = filename.substring(0, extension) + ".raw" + filename.substring(extension);
\r
147 PrintStream output;
\r
148 double cumulative = 0;
\r
151 output = new PrintStream(
\r
152 new FileOutputStream(outputFileName), true, "UTF8");
\r
153 } catch (IOException e) {
\r
154 System.out.println("? Could not open " + outputFileName + " for writing.");
\r
158 System.out.println(inputFile.getFilename() + ": " + ngrams.getUniqueNGrams() + "/" + ngrams.getTotalNGrams());
\r
160 ArrayList array = new ArrayList(ngrams.values());
\r
162 Collections.sort(array);
\r
164 NGramList stats = new NGramList(inputFile);
\r
166 int totalNGrams = ngrams.getTotalNGrams();
\r
168 for (Iterator it = array.iterator(); it.hasNext(); count += 1) {
\r
169 NGramList.NGram ngram = (NGramList.NGram) it.next();
\r
170 String value = ngram.getValue();
\r
171 int refCount = ngram.getRefCount();
\r
172 double ratio = (double) refCount / totalNGrams * 100.0;
\r
174 cumulative += ratio;
\r
176 // TODO: this check should be (count < max && cumulative < maxPercent).
\r
181 output.println(value + "\t" + refCount + "\t" + ratio + "%\t" + cumulative + "%");
\r
189 private void writeStatistics(ArrayList keyList, boolean visual)
\r
191 String filename = inputFile.getPath();
\r
192 int extension = filename.lastIndexOf(".");
\r
193 String outputFileName = filename.substring(0, extension) + "-" + inputFile.getEncoding() +
\r
194 (visual? "-visual.dat" : ".dat");
\r
195 PrintStream output;
\r
198 output = new PrintStream(
\r
199 new FileOutputStream(outputFileName), true, "ASCII");
\r
200 } catch (IOException e) {
\r
201 System.out.println("? Could not open " + outputFileName + " for writing.");
\r
207 output.print(" private static int[] ngrams = {");
\r
209 for (Iterator it = keyList.iterator(); it.hasNext(); i += 1) {
\r
210 Integer ngram = (Integer) it.next();
\r
213 output.print("\n ");
\r
216 output.print("0x" + Utility.hex(ngram.intValue(), 6) + ", ");
\r
219 output.println("\n };\n");
\r
222 * Generate the byte map
\r
224 char[] unicodes = inputFile.decode(allBytes);
\r
226 for (int b = 0; b < 256; b += 1) {
\r
227 char unicode = unicodes[b];
\r
228 int charClass = NGramParser.getCharClass(unicode);
\r
230 switch (charClass) {
\r
231 case NGramParser.C_LETTER:
\r
232 unicodes[b] = Character.toLowerCase(unicode);
\r
235 case NGramParser.C_PUNCT:
\r
239 case NGramParser.C_IGNORE:
\r
241 unicodes[b] = '\0';
\r
245 byte[] byteMap = inputFile.encode(unicodes);
\r
247 output.print(" private static byte[] byteMap = {");
\r
249 for (int b = 0; b < 256; b += 1) {
\r
251 output.print("\n ");
\r
254 output.print("(byte) 0x" + Utility.hex(byteMap[b] & 0xFF, 2) + ", ");
\r
257 output.println("\n };");
\r
260 public NGramList collectStatistics(InputFile file)
\r
262 if (!file.open()) {
\r
268 NGramParser parser = new NGramParser(this);
\r
270 ngrams = new NGramList(this);
\r
275 NGramList stats = dumpNGrams();
\r
276 ArrayList statKeys = new ArrayList(stats.keys());
\r
278 Collections.sort(statKeys);
\r
279 writeStatistics(statKeys, false);
\r
281 if (inputFile.getVisualOrder()) {
\r
282 ArrayList reversed = new ArrayList(statKeys.size());
\r
284 for (Iterator it = statKeys.iterator(); it.hasNext();) {
\r
285 Integer key = (Integer) it.next();
\r
286 int k = key.intValue();
\r
290 r = (r << 8) | (k & 0xFF);
\r
294 reversed.add(new Integer(r));
\r
297 Collections.sort(reversed);
\r
298 writeStatistics(reversed, true);
\r
304 public static void main(String[] args)
\r
306 List list = Arrays.asList(args);
\r
307 InputFile[] input_files = new InputFile[args.length];
\r
308 int file_count = 0;
\r
309 String encoding = null;
\r
310 boolean run_checker = false;
\r
311 boolean encoding_test = false;
\r
312 boolean visual_order = false;
\r
314 for (Iterator it = list.iterator(); it.hasNext(); /*anything?*/) {
\r
315 String arg = (String) it.next();
\r
317 if (arg.equals("-v")) {
\r
318 visual_order = true;
\r
319 } else if (arg.equals("-l")) {
\r
320 visual_order = false;
\r
321 } else if (arg.equals("-c")) {
\r
322 run_checker = true;
\r
323 } else if (arg.equals("-t")) {
\r
324 encoding_test = true;
\r
325 } else if (arg.equals("-e")) {
\r
326 if (it.hasNext()) {
\r
327 encoding = (String) it.next();
\r
329 System.err.println("Error: missing encoding.");
\r
331 } else if (arg.startsWith("-")) {
\r
332 if (! (arg.equals("-h") || arg.equals("-?"))) {
\r
333 System.err.println("Error: unknown option " + arg);
\r
338 input_files[file_count++] = new InputFile(arg, encoding, visual_order);
\r
342 if(file_count == 0){
\r
343 System.err.println("Error: there are no files to process.");
\r
347 StatisticsTool tool = new StatisticsTool();
\r
348 Checker[] checkers = new Checker[file_count];
\r
350 for(int i = 0; i < file_count; i += 1) {
\r
351 InputFile file = input_files[i];
\r
353 checkers[i] = new Checker(tool.collectStatistics(file), file);
\r
356 System.out.println();
\r
362 for(int c = 0; c < file_count; c += 1) {
\r
363 Checker checker = checkers[c];
\r
365 for(int f = 0; f < file_count; f += 1) {
\r
366 checker.check(input_files[f]);
\r
375 if (encoding_test) {
\r
376 char[] buffer = new char[128];
\r
378 System.out.println("Detection test");
\r
380 for (int f = 0; f < file_count; f += 1) {
\r
381 InputFile file = input_files[f];
\r
382 int[] histogram = new int[file_count];
\r
383 int charCount, misses = 0;
\r
385 System.out.println(file.getFilename() + "(" + file.getEncoding() + "):");
\r
388 for (int c = 0; c < file_count; c += 1) {
\r
389 checkers[c].setMapper(file);
\r
393 // for each checker
\r
394 // call checkBuffer, save score
\r
395 // find highest score, update histogram for that checker
\r
396 // show checker histogram
\r
398 while ((charCount = file.read(buffer)) > 0) {
\r
399 int[] scores = new int[file_count];
\r
400 int bestFit = -1, maxScore = 0;
\r
402 for (int c = 0; c < file_count; c += 1) {
\r
403 scores[c] = checkers[c].checkBuffer(buffer, charCount);
\r
406 for (int c = 0; c < file_count; c += 1) {
\r
407 int score = scores[c];
\r
409 if (score > maxScore) {
\r
415 if (bestFit >= 0) {
\r
416 histogram[bestFit] += 1;
\r
422 for (int c = 0; c < file_count; c += 1) {
\r
423 System.out.println(" " + checkers[c].getLanguage() + ": " + histogram[c]);
\r
427 System.out.println(" NONE: " + misses);
\r
430 System.out.println();
\r