/* *********************************************************************** * Copyright (C) 2005, International Business Machines Corporation and * * others. All Rights Reserved. * *********************************************************************** * */ package com.ibm.icu.dev.tool.charsetdet.sbcs; import com.ibm.icu.text.UnicodeSet; /** * @author emader * * TODO To change the template for this generated type comment go to * Window - Preferences - Java - Code Style - Code Templates */ public class NGramParser { public interface NGramParserClient { char nextChar(); void handleNGram(String key); } private static final int A_NULL = 0; private static final int A_ADDC = 1; private static final int A_ADDS = 2; /* * Character classes */ public static final int C_IGNORE = 0; public static final int C_LETTER = 1; public static final int C_PUNCT = 2; private static final int S_START = 0; private static final int S_LETTER = 1; private static final int S_PUNCT = 2; static final class StateEntry { private int newState; private int action; StateEntry(int theState, int theAction) { newState = theState; action = theAction; } public int getNewState() { return newState; } public int getAction() { return action; } } private StateEntry[][] stateTable = { {new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)}, {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)}, {new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)} }; protected final int N_GRAM_SIZE = 3; private char[] letters = new char[N_GRAM_SIZE]; private int letterCount; private static UnicodeSet letterSet = new UnicodeSet("[:letter:]"); private NGramParserClient client; /** * */ public NGramParser(NGramParserClient theClient) { client = theClient; letterCount = 0; } public void setClient(NGramParserClient theClient) { client = theClient; } // TODO Is this good enough, or are there other C_IGNORE characters? // TODO Could this make Latin letters C_PUNCT for non-Latin scripts? public static int getCharClass(char ch) { if (ch == '\'' || ch == '\uFEFF') { return C_IGNORE; } if (letterSet.contains(ch)) { return C_LETTER; } return C_PUNCT; } public void reset() { letterCount = 0; } public void addLetter(char letter) { // somewhat clever stuff goes here... letters[letterCount++] = letter; if (letterCount >= N_GRAM_SIZE) { String key = new String(letters); client.handleNGram(key); letterCount = N_GRAM_SIZE - 1; for (int i = 0; i < letterCount; i += 1) { letters[i] = letters[i + 1]; } } } public void parse() { char ch; int state = 0; // this is where the clever stuff goes... while ((ch = client.nextChar()) != 0) { int charClass = getCharClass(ch); StateEntry entry = stateTable[state][charClass]; state = entry.getNewState(); switch (entry.getAction()) { case A_ADDC: addLetter(Character.toLowerCase(ch)); break; case A_ADDS: addLetter(' '); break; case A_NULL: default: break; } } addLetter(' '); } }