/* *********************************************************************** * Copyright (C) 2005-2006, International Business Machines * * Corporation and others. All Rights Reserved. * *********************************************************************** * */ package com.ibm.icu.dev.tool.charsetdet.sbcs; /** * @author emader * * TODO To change the template for this generated type comment go to * Window - Preferences - Java - Code Style - Code Templates */ public class Checker implements NGramParser.NGramParserClient { private NGramList ngrams; private int totalNGrams; private int totalHits; private String language; private String encoding; private int[] histogram; private static final int BUFFER_SIZE = 1024; private char[] buffer; private int bufIndex; private int bufMax; private NGramParser parser; /** * TODO This should take cumulative percent and the name... */ public Checker(NGramList list, InputFile dataFile) { ngrams = list; ngrams.setMapper(dataFile); language = languageName(dataFile.getFilename()); encoding = dataFile.getEncoding(); buffer = new char[BUFFER_SIZE]; parser = new NGramParser(this); resetCounts(); histogram = new int[100]; resetHistogram(); } public void handleNGram(String key) { NGramList.NGram ngram = ngrams.get(key); totalNGrams += 1; if (ngram != null) { totalHits += 1; //ngram.incrementRefCount(); } } private void resetCounts() { bufIndex = 0; totalNGrams = totalHits = 0; } private void resetHistogram() { for(int i = 0; i < 100; i += 1) { histogram[i] = 0; } } private static void exceptionError(Exception e) { System.err.println("ioError: " + e.toString()); } private static String languageName(String filename) { return filename.substring(0, filename.indexOf('.')); } private boolean nextBuffer(InputFile inputFile) { try { bufMax = inputFile.read(buffer); } catch (Exception e) { bufMax = -1; exceptionError(e); return false; } bufIndex = 0; return bufMax >= 0; } private void parseBuffer() { resetCounts(); parser.reset(); parser.parse(); } public char nextChar() { if (bufIndex >= bufMax) { return 0; } return buffer[bufIndex++]; } public String getLanguage() { return language; } public void setMapper(InputFile file) { ngrams.setMapper(file); } public int checkBuffer(char[] theBuffer, int charCount) { buffer = theBuffer; bufMax = charCount; parseBuffer(); return totalHits; } public void check(InputFile dataFile) { int minHist = 101, maxHist = -1; dataFile.open(); String dataFilename = dataFile.getFilename(); String fileEncoding = dataFile.getEncoding(); System.out.println(language + "(" + encoding + ") stats, " + languageName(dataFilename) + "(" + fileEncoding + ") data:"); setMapper(dataFile); resetHistogram(); while (nextBuffer(dataFile)) { parseBuffer(); double percentHits = (double) totalHits / totalNGrams * 100.0; int ph = (int) percentHits; if (ph < minHist) { minHist = ph; } if (ph > maxHist) { maxHist = ph; } histogram[ph] += 1; } for(int ph = minHist; ph <= maxHist; ph += 1) { System.out.println(ph + "\t" + histogram[ph]); } System.out.println(); dataFile.close(); return; } }