jars/icu4j-52_1/tools/misc/src/com/ibm/icu/dev/tool/charsetdet/sbcs/NGramParser.java

   1 /*
   2  ***********************************************************************
   3  * Copyright (C) 2005, International Business Machines Corporation and *
   4  * others. All Rights Reserved.                                        *
   5  ***********************************************************************
   6  *
   7  */
   8
   9 package com.ibm.icu.dev.tool.charsetdet.sbcs;
  10
  11 import com.ibm.icu.text.UnicodeSet;
  12
  13 /**
  14  * @author emader
  15  *
  16  * TODO To change the template for this generated type comment go to
  17  * Window - Preferences - Java - Code Style - Code Templates
  18  */
  19 public class NGramParser
  20 {
  21
  22     public interface NGramParserClient
  23     {
  24         char nextChar();
  25         void handleNGram(String key);
  26     }
  27
  28     private static final int A_NULL = 0;
  29     private static final int A_ADDC = 1;
  30     private static final int A_ADDS = 2;
  31
  32     /*
  33      * Character classes
  34      */
  35     public static final int C_IGNORE = 0;
  36     public static final int C_LETTER = 1;
  37     public static final int C_PUNCT  = 2;
  38
  39     private static final int S_START  = 0;
  40     private static final int S_LETTER = 1;
  41     private static final int S_PUNCT  = 2;
  42
  43     static final class StateEntry
  44     {
  45         private int newState;
  46         private int action;
  47
  48         StateEntry(int theState, int theAction)
  49         {
  50             newState = theState;
  51             action   = theAction;
  52         }
  53
  54         public int getNewState()
  55         {
  56             return newState;
  57         }
  58
  59         public int getAction()
  60         {
  61             return action;
  62         }
  63     }
  64
  65     private StateEntry[][] stateTable = {
  66             {new StateEntry(S_START,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
  67             {new StateEntry(S_LETTER, A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_ADDS)},
  68             {new StateEntry(S_PUNCT,  A_NULL), new StateEntry(S_LETTER, A_ADDC), new StateEntry(S_PUNCT,  A_NULL)}
  69     };
  70
  71     protected final int N_GRAM_SIZE = 3;
  72
  73     private char[] letters = new char[N_GRAM_SIZE];
  74     private int letterCount;
  75
  76     private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
  77
  78     private NGramParserClient client;
  79
  80     /**
  81      *
  82      */
  83     public NGramParser(NGramParserClient theClient)
  84     {
  85         client = theClient;
  86         letterCount = 0;
  87     }
  88
  89     public void setClient(NGramParserClient theClient)
  90     {
  91         client = theClient;
  92     }
  93
  94     // TODO Is this good enough, or are there other C_IGNORE characters?
  95     // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
  96     public static int getCharClass(char ch)
  97     {
  98         if (ch == '\'' || ch == '\uFEFF') {
  99             return C_IGNORE;
 100         }
 101
 102         if (letterSet.contains(ch)) {
 103             return C_LETTER;
 104         }
 105
 106         return C_PUNCT;
 107     }
 108
 109     public void reset()
 110     {
 111         letterCount = 0;
 112     }
 113
 114     public void addLetter(char letter)
 115     {
 116         // somewhat clever stuff goes here...
 117         letters[letterCount++] = letter;
 118
 119         if (letterCount >= N_GRAM_SIZE) {
 120             String key = new String(letters);
 121
 122             client.handleNGram(key);
 123
 124             letterCount = N_GRAM_SIZE - 1;
 125             for (int i = 0; i < letterCount; i += 1) {
 126                 letters[i] = letters[i + 1];
 127             }
 128         }
 129     }
 130
 131     public void parse()
 132     {
 133         char ch;
 134         int state = 0;
 135
 136         // this is where the clever stuff goes...
 137         while ((ch = client.nextChar()) != 0) {
 138             int charClass = getCharClass(ch);
 139             StateEntry entry = stateTable[state][charClass];
 140
 141             state = entry.getNewState();
 142
 143             switch (entry.getAction())
 144             {
 145             case A_ADDC:
 146                 addLetter(Character.toLowerCase(ch));
 147                 break;
 148
 149             case A_ADDS:
 150                 addLetter(' ');
 151                 break;
 152
 153             case A_NULL:
 154             default:
 155                 break;
 156             }
 157         }
 158
 159         addLetter(' ');
 160     }
 161 }