2 ***********************************************************************
3 * Copyright (C) 2005, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ***********************************************************************
9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
11 import com.ibm.icu.text.UnicodeSet;
16 * TODO To change the template for this generated type comment go to
17 * Window - Preferences - Java - Code Style - Code Templates
19 public class NGramParser
// Callback interface through which the parser talks to its owner.
// NOTE(review): the parse loop also calls client.nextChar(); that declaration is
// not visible in this excerpt - confirm it is declared on this interface.
22 public interface NGramParserClient
// Called by addLetter() with each completed n-gram, passed as a String built
// from the parser's letter buffer.
25 void handleNGram(String key);
// Action codes: the second constructor argument of every StateEntry in stateTable.
// NOTE(review): presumably A_NULL = do nothing, A_ADDC = add the current character,
// A_ADDS = add a separator; the switch that interprets them is not fully visible - confirm.
28 private static final int A_NULL = 0;
29 private static final int A_ADDC = 1;
30 private static final int A_ADDS = 2;
// Character classes. The parse loop uses the value returned by getCharClass()
// as the column index into stateTable, so these values must stay 0, 1, 2.
35 public static final int C_IGNORE = 0;
36 public static final int C_LETTER = 1;
37 public static final int C_PUNCT = 2;
// Parser states. Used as the row index into stateTable and as the "new state"
// (first constructor argument) of each StateEntry, so these must stay 0, 1, 2.
39 private static final int S_START = 0;
40 private static final int S_LETTER = 1;
41 private static final int S_PUNCT = 2;
// One cell of the state table: pairs a state with an action code.
// NOTE(review): the fields and method bodies are not visible in this excerpt;
// presumably the accessors return the constructor arguments unchanged - confirm.
43 static final class StateEntry
// theState: one of the S_* constants; theAction: one of the A_* constants
// (this is how stateTable constructs its entries).
48 StateEntry(int theState, int theAction)
// The parse loop assigns this value to its current state.
54 public int getNewState()
// The parse loop switches on this value to decide what to do with the character.
59 public int getAction()
// Transition table, read by the parse loop as stateTable[state][charClass].
// Rows are in S_START, S_LETTER, S_PUNCT order; columns are in
// C_IGNORE, C_LETTER, C_PUNCT order.
65 private StateEntry[][] stateTable = {
// Row S_START: ignorable chars keep the state with no action; a letter moves to
// S_LETTER with A_ADDC; punctuation moves to S_PUNCT with A_ADDS.
66 {new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
// Row S_LETTER: same transitions as S_START, except ignorable chars stay in S_LETTER.
67 {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
// Row S_PUNCT: further punctuation stays in S_PUNCT with A_NULL, so a run of
// punctuation triggers A_ADDS only once (on entry to S_PUNCT).
68 {new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)}
// Number of letters in each n-gram; also the length of the letters buffer.
// NOTE(review): declared as a per-instance final field rather than static -
// could likely be made static; confirm no other code depends on the current form.
71 protected final int N_GRAM_SIZE = 3;
// Buffer holding the letters of the n-gram currently being assembled (see addLetter).
73 private char[] letters = new char[N_GRAM_SIZE];
// Number of slots in letters that are currently filled.
74 private int letterCount;
// Set built from the "[:letter:]" pattern; getCharClass() tests characters against it.
76 private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
// Receiver of completed n-grams (addLetter) and source of characters (parse loop).
78 private NGramParserClient client;
// Constructs a parser for the given client.
// NOTE(review): the constructor body is not visible in this excerpt; presumably it
// stores theClient in the client field and resets letterCount - confirm.
83 public NGramParser(NGramParserClient theClient)
// Supplies a client for this parser.
// NOTE(review): the method body is not visible in this excerpt; presumably it
// replaces the client field with theClient - confirm.
89 public void setClient(NGramParserClient theClient)
// Classifies a character for the state machine. The parse loop uses the
// returned value as the column index into stateTable.
94 // TODO Is this good enough, or are there other C_IGNORE characters?
95 // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
96 public static int getCharClass(char ch)
// Apostrophe and U+FEFF are special-cased, and tested before the letter check.
98 if (ch == '\'' || ch == '\uFEFF') {
// Characters contained in letterSet (built from the "[:letter:]" pattern).
102 if (letterSet.contains(ch)) {
// NOTE(review): the return statements are not visible in this excerpt; presumably
// the branches return C_IGNORE and C_LETTER respectively, with C_PUNCT for
// everything else - confirm.
114 public void addLetter(char letter)
116 // somewhat clever stuff goes here...
117 letters[letterCount++] = letter;
119 if (letterCount >= N_GRAM_SIZE) {
120 String key = new String(letters);
122 client.handleNGram(key);
124 letterCount = N_GRAM_SIZE - 1;
125 for (int i = 0; i < letterCount; i += 1) {
126 letters[i] = letters[i + 1];
136 // this is where the clever stuff goes...
137 while ((ch = client.nextChar()) != 0) {
138 int charClass = getCharClass(ch);
139 StateEntry entry = stateTable[state][charClass];
141 state = entry.getNewState();
143 switch (entry.getAction())
146 addLetter(Character.toLowerCase(ch));