2 ***********************************************************************
\r
3 * Copyright (C) 2005-2006, International Business Machines *
\r
4 * Corporation and others. All Rights Reserved. *
\r
5 ***********************************************************************
\r
9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
\r
14 * TODO To change the template for this generated type comment go to
\r
15 * Window - Preferences - Java - Code Style - Code Templates
\r
17 public class Checker implements NGramParser.NGramParserClient
\r
19 private NGramList ngrams;
\r
20 private int totalNGrams;
\r
21 private int totalHits;
\r
23 private String language;
\r
24 private String encoding;
\r
26 private int[] histogram;
\r
28 private static final int BUFFER_SIZE = 1024;
\r
30 private char[] buffer;
\r
31 private int bufIndex;
\r
34 private NGramParser parser;
\r
37 * TODO This should take cumulative percent and the name...
\r
39 public Checker(NGramList list, InputFile dataFile)
\r
42 ngrams.setMapper(dataFile);
\r
44 language = languageName(dataFile.getFilename());
\r
45 encoding = dataFile.getEncoding();
\r
47 buffer = new char[BUFFER_SIZE];
\r
48 parser = new NGramParser(this);
\r
51 histogram = new int[100];
\r
55 public void handleNGram(String key)
\r
57 NGramList.NGram ngram = ngrams.get(key);
\r
61 if (ngram != null) {
\r
63 //ngram.incrementRefCount();
\r
67 private void resetCounts()
\r
70 totalNGrams = totalHits = 0;
\r
73 private void resetHistogram()
\r
75 for(int i = 0; i < 100; i += 1) {
\r
81 private static void exceptionError(Exception e)
\r
83 System.err.println("ioError: " + e.toString());
\r
86 private static String languageName(String filename)
\r
88 return filename.substring(0, filename.indexOf('.'));
\r
91 private boolean nextBuffer(InputFile inputFile)
\r
94 bufMax = inputFile.read(buffer);
\r
95 } catch (Exception e) {
\r
104 return bufMax >= 0;
\r
107 private void parseBuffer()
\r
114 public char nextChar()
\r
116 if (bufIndex >= bufMax) {
\r
120 return buffer[bufIndex++];
\r
123 public String getLanguage()
\r
128 public void setMapper(InputFile file)
\r
130 ngrams.setMapper(file);
\r
133 public int checkBuffer(char[] theBuffer, int charCount)
\r
135 buffer = theBuffer;
\r
136 bufMax = charCount;
\r
// Prints a header naming this checker's language/encoding and the data
// file's language/encoding, reads the data file through nextBuffer(), and
// prints one "percent<TAB>count" line for each histogram slot from minHist
// to maxHist.
// NOTE(review): several lines of this method (braces and some statements)
// are absent from this listing, so the exact nesting of the statements
// below cannot be confirmed from here.
143 public void check(InputFile dataFile)
\r
// minHist/maxHist start outside the 0..100 percent range so the first
// observed value replaces both.
145 int minHist = 101, maxHist = -1;
\r
149 String dataFilename = dataFile.getFilename();
\r
150 String fileEncoding = dataFile.getEncoding();
\r
152 System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:");
\r
154 setMapper(dataFile);
\r
157 while (nextBuffer(dataFile)) {
\r
// NOTE(review): when totalNGrams is 0 this is 0.0 / 0 = NaN, and the
// (int) cast below turns NaN into 0 - confirm that bucket 0 is intended
// for an empty buffer.
160 double percentHits = (double) totalHits / totalNGrams * 100.0;
\r
// Truncates toward zero, so ph is 100 only when percentHits is exactly 100.
161 int ph = (int) percentHits;
\r
163 if (ph < minHist) {
\r
167 if (ph > maxHist) {
\r
// NOTE(review): the constructor allocates histogram as new int[100]
// (valid indices 0..99). If totalHits can equal totalNGrams, ph is 100 and
// this index is out of bounds. minHist starting at 101 suggests the range
// 0..100 was intended - confirm, and if so size the array (and the
// resetHistogram loop bound) to 101.
171 histogram[ph] += 1;
\r
174 for(int ph = minHist; ph <= maxHist; ph += 1) {
\r
175 System.out.println(ph + "\t" + histogram[ph]);
\r
178 System.out.println();
\r