2 ***********************************************************************
\r
3 * Copyright (C) 2005, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ***********************************************************************
\r
9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
\r
11 import com.ibm.icu.text.UnicodeSet;
\r
 * Breaks the characters supplied by an NGramParserClient into n-grams of
 * N_GRAM_SIZE characters and passes each n-gram back to the client.
\r
19 public class NGramParser
\r
22 public interface NGramParserClient
\r
25 void handleNGram(String key);
\r
28 private static final int A_NULL = 0;
\r
29 private static final int A_ADDC = 1;
\r
30 private static final int A_ADDS = 2;
\r
35 public static final int C_IGNORE = 0;
\r
36 public static final int C_LETTER = 1;
\r
37 public static final int C_PUNCT = 2;
\r
39 private static final int S_START = 0;
\r
40 private static final int S_LETTER = 1;
\r
41 private static final int S_PUNCT = 2;
\r
43 static final class StateEntry
\r
45 private int newState;
\r
48 StateEntry(int theState, int theAction)
\r
50 newState = theState;
\r
54 public int getNewState()
\r
59 public int getAction()
\r
65 private StateEntry[][] stateTable = {
\r
66 {new StateEntry(S_START, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
\r
67 {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_ADDS)},
\r
68 {new StateEntry(S_PUNCT, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT, A_NULL)}
\r
71 protected final int N_GRAM_SIZE = 3;
\r
73 private char[] letters = new char[N_GRAM_SIZE];
\r
74 private int letterCount;
\r
76 private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
\r
78 private NGramParserClient client;
\r
83 public NGramParser(NGramParserClient theClient)
\r
89 public void setClient(NGramParserClient theClient)
\r
94 // TODO Is this good enough, or are there other C_IGNORE characters?
\r
95 // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
\r
96 public static int getCharClass(char ch)
\r
98 if (ch == '\'' || ch == '\uFEFF') {
\r
102 if (letterSet.contains(ch)) {
\r
109 public void reset()
\r
114 public void addLetter(char letter)
\r
116 // somewhat clever stuff goes here...
\r
117 letters[letterCount++] = letter;
\r
119 if (letterCount >= N_GRAM_SIZE) {
\r
120 String key = new String(letters);
\r
122 client.handleNGram(key);
\r
124 letterCount = N_GRAM_SIZE - 1;
\r
125 for (int i = 0; i < letterCount; i += 1) {
\r
126 letters[i] = letters[i + 1];
\r
131 public void parse()
\r
136 // this is where the clever stuff goes...
\r
137 while ((ch = client.nextChar()) != 0) {
\r
138 int charClass = getCharClass(ch);
\r
139 StateEntry entry = stateTable[state][charClass];
\r
141 state = entry.getNewState();
\r
143 switch (entry.getAction())
\r
146 addLetter(Character.toLowerCase(ch));
\r