jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/BreakDictionary.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2012, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8
   9 import java.io.DataInputStream;
  10 import java.io.FileInputStream;
  11 import java.io.FileNotFoundException;
  12 import java.io.FileOutputStream;
  13 import java.io.IOException;
  14 import java.io.InputStream;
  15 import java.io.OutputStreamWriter;
  16 import java.io.PrintWriter;
  17 import java.io.UnsupportedEncodingException;
  18
  19 import com.ibm.icu.util.CompactByteArray;
  20
  21 /**
  22  * This is the class that represents the list of known words used by
  23  * DictionaryBasedBreakIterator.  The conceptual data structure used
  24  * here is a trie: there is a node hanging off the root node for every
  25  * letter that can start a word.  Each of these nodes has a node hanging
  26  * off of it for every letter that can be the second letter of a word
  27  * if this node is the first letter, and so on.  The trie is represented
  28  * as a two-dimensional array that can be treated as a table of state
  29  * transitions.  Indexes are used to compress this array, taking
  30  * advantage of the fact that this array will always be very sparse.
  31  */
  32 class BreakDictionary {
  33     //=================================================================================
  34     // testing and debugging
  35     //=================================================================================
  36
  37 //    public static void main(String... args) {
  38 //        String inFile = args[0];
  39 //        String outFile = args.length >= 2 ? args[1] : null;
  40 //        try {
  41 //            writeToFile(inFile, outFile);
  42 //        } catch (Exception e) {
  43 //            e.printStackTrace();
  44 //        }
  45 //    }
  46
  47     ///CLOVER:OFF
  48     static void writeToFile(String inFile, String outFile)
  49             throws FileNotFoundException, UnsupportedEncodingException, IOException {
  50
  51         BreakDictionary dictionary = new BreakDictionary(new FileInputStream(inFile));
  52
  53         PrintWriter out = null;
  54
  55         if(outFile != null) {
  56             out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFile), "UnicodeLittle"));
  57         }
  58
  59         dictionary.printWordList("", 0, out);
  60
  61         if (out != null) {
  62             out.close();
  63         }
  64     }
  65     ///CLOVER:ON
  66
  67     ///CLOVER:OFF
  68     /* public */ void printWordList(String partialWord, int state, PrintWriter out)
  69             throws IOException {
  70         if (state == 0xFFFF) {
  71             System.out.println(partialWord);
  72             if (out != null) {
  73                 out.println(partialWord);
  74             }
  75         }
  76         else {
  77             for (int i = 0; i < numCols; i++) {
  78                 int newState = (at(state, i)) & 0xFFFF;
  79
  80                 if (newState != 0) {
  81                     char newChar = reverseColumnMap[i];
  82                     String newPartialWord = partialWord;
  83
  84                     if (newChar != 0) {
  85                         newPartialWord += newChar;
  86                     }
  87
  88                     printWordList(newPartialWord, newState, out);
  89                 }
  90             }
  91         }
  92     }
  93     ///CLOVER:ON
  94
  95     /**
  96      * A map used to go from column numbers to characters.  Used only
  97      * for debugging right now.
  98      */
  99     private char[] reverseColumnMap = null;
 100
 101     //=================================================================================
 102     // data members
 103     //=================================================================================
 104
 105     /**
 106      * Maps from characters to column numbers.  The main use of this is to
 107      * avoid making room in the array for empty columns.
 108      */
 109     private CompactByteArray columnMap = null;
 110
 111     /**
 112      * The number of actual columns in the table
 113      */
 114     private int numCols;
 115
 116     /*
 117      * Columns are organized into groups of 32.  This says how many
 118      * column groups.  (We could calculate this, but we store the
 119      * value to avoid having to repeatedly calculate it.)
 120      */
 121     //private int numColGroups;
 122
 123     /**
 124      * The actual compressed state table.  Each conceptual row represents
 125      * a state, and the cells in it contain the row numbers of the states
 126      * to transition to for each possible letter.  0 is used to indicate
 127      * an illegal combination of letters (i.e., the error state).  The
 128      * table is compressed by eliminating all the unpopulated (i.e., zero)
 129      * cells.  Multiple conceptual rows can then be doubled up in a single
 130      * physical row by sliding them up and possibly shifting them to one
 131      * side or the other so the populated cells don't collide.  Indexes
 132      * are used to identify unpopulated cells and to locate populated cells.
 133      */
 134     private short[] table = null;
 135
 136     /**
 137      * This index maps logical row numbers to physical row numbers
 138      */
 139     private short[] rowIndex = null;
 140
 141     /**
 142      * A bitmap is used to tell which cells in the comceptual table are
 143      * populated.  This array contains all the unique bit combinations
 144      * in that bitmap.  If the table is more than 32 columns wide,
 145      * successive entries in this array are used for a single row.
 146      */
 147     private int[] rowIndexFlags = null;
 148
 149     /**
 150      * This index maps from a logical row number into the bitmap table above.
 151      * (This keeps us from storing duplicate bitmap combinations.)  Since there
 152      * are a lot of rows with only one populated cell, instead of wasting space
 153      * in the bitmap table, we just store a negative number in this index for
 154      * rows with one populated cell.  The absolute value of that number is
 155      * the column number of the populated cell.
 156      */
 157     private short[] rowIndexFlagsIndex = null;
 158
 159     /**
 160      * For each logical row, this index contains a constant that is added to
 161      * the logical column number to get the physical column number
 162      */
 163     private byte[] rowIndexShifts = null;
 164
 165     //=================================================================================
 166     // deserialization
 167     //=================================================================================
 168
 169     /* public */ BreakDictionary(InputStream dictionaryStream) throws IOException {
 170         readDictionaryFile(new DataInputStream(dictionaryStream));
 171     }
 172
 173     /* public */ void readDictionaryFile(DataInputStream in) throws IOException {
 174         int l;
 175
 176         // read in the version number (right now we just ignore it)
 177         in.readInt();
 178
 179         // read in the column map (this is serialized in its internal form:
 180         // an index array followed by a data array)
 181         l = in.readInt();
 182         char[] temp = new char[l];
 183         for (int i = 0; i < temp.length; i++)
 184             temp[i] = (char)in.readShort();
 185         l = in.readInt();
 186         byte[] temp2 = new byte[l];
 187         for (int i = 0; i < temp2.length; i++)
 188             temp2[i] = in.readByte();
 189         columnMap = new CompactByteArray(temp, temp2);
 190
 191         // read in numCols and numColGroups
 192         numCols = in.readInt();
 193         /*numColGroups = */in.readInt();
 194
 195         // read in the row-number index
 196         l = in.readInt();
 197         rowIndex = new short[l];
 198         for (int i = 0; i < rowIndex.length; i++)
 199             rowIndex[i] = in.readShort();
 200
 201         // load in the populated-cells bitmap: index first, then bitmap list
 202         l = in.readInt();
 203         rowIndexFlagsIndex = new short[l];
 204         for (int i = 0; i < rowIndexFlagsIndex.length; i++)
 205             rowIndexFlagsIndex[i] = in.readShort();
 206         l = in.readInt();
 207         rowIndexFlags = new int[l];
 208         for (int i = 0; i < rowIndexFlags.length; i++)
 209             rowIndexFlags[i] = in.readInt();
 210
 211         // load in the row-shift index
 212         l = in.readInt();
 213         rowIndexShifts = new byte[l];
 214         for (int i = 0; i < rowIndexShifts.length; i++)
 215             rowIndexShifts[i] = in.readByte();
 216
 217         // finally, load in the actual state table
 218         l = in.readInt();
 219         table = new short[l];
 220         for (int i = 0; i < table.length; i++)
 221             table[i] = in.readShort();
 222
 223         // this data structure is only necessary for testing and debugging purposes
 224         reverseColumnMap = new char[numCols];
 225         for (char c = 0; c < 0xffff; c++) {
 226             int col = columnMap.elementAt(c);
 227             if (col != 0) {
 228                reverseColumnMap[col] = c;
 229             }
 230         }
 231
 232         // close the stream
 233         in.close();
 234     }
 235
 236     //=================================================================================
 237     // access to the words
 238     //=================================================================================
 239
 240     /**
 241      * Uses the column map to map the character to a column number, then
 242      * passes the row and column number to the other version of at()
 243      * @param row The current state
 244      * @param ch The character whose column we're interested in
 245      * @return The new state to transition to
 246      */
 247     /* public */ final short at(int row, char ch) {
 248         int col = columnMap.elementAt(ch);
 249         return at(row, col);
 250     }
 251
 252     /**
 253      * Returns the value in the cell with the specified (logical) row and
 254      * column numbers.  In DictionaryBasedBreakIterator, the row number is
 255      * a state number, the column number is an input, and the return value
 256      * is the row number of the new state to transition to.  (0 is the
 257      * "error" state, and -1 is the "end of word" state in a dictionary)
 258      * @param row The row number of the current state
 259      * @param col The column number of the input character (0 means "not a
 260      * dictionary character")
 261      * @return The row number of the new state to transition to
 262      */
 263     /* public */ final short at(int row, int col) {
 264         if (cellIsPopulated(row, col)) {
 265             // we map from logical to physical row number by looking up the
 266             // mapping in rowIndex; we map from logical column number to
 267             // physical column number by looking up a shift value for this
 268             // logical row and offsetting the logical column number by
 269             // the shift amount.  Then we can use internalAt() to actually
 270             // get the value out of the table.
 271             return internalAt(rowIndex[row], col + rowIndexShifts[row]);
 272         }
 273         else {
 274             return 0;
 275         }
 276     }
 277
 278     /**
 279      * Given (logical) row and column numbers, returns true if the
 280      * cell in that position is populated
 281      */
 282     private final boolean cellIsPopulated(int row, int col) {
 283         // look up the entry in the bitmap index for the specified row.
 284         // If it's a negative number, it's the column number of the only
 285         // populated cell in the row
 286         if (rowIndexFlagsIndex[row] < 0) {
 287             return col == -rowIndexFlagsIndex[row];
 288         }
 289
 290         // if it's a positive number, it's the offset of an entry in the bitmap
 291         // list.  If the table is more than 32 columns wide, the bitmap is stored
 292         // successive entries in the bitmap list, so we have to divide the column
 293         // number by 32 and offset the number we got out of the index by the result.
 294         // Once we have the appropriate piece of the bitmap, test the appropriate
 295         // bit and return the result.
 296         else {
 297             int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
 298             return (flags & (1 << (col & 0x1f))) != 0;
 299         }
 300     }
 301
 302     /**
 303      * Implementation of at() when we know the specified cell is populated.
 304      * @param row The PHYSICAL row number of the cell
 305      * @param col The PHYSICAL column number of the cell
 306      * @return The value stored in the cell
 307      */
 308     private final short internalAt(int row, int col) {
 309         // the table is a one-dimensional array, so this just does the math necessary
 310         // to treat it as a two-dimensional array (we don't just use a two-dimensional
 311         // array because two-dimensional arrays are inefficient in Java)
 312         return table[row * numCols + col];
 313     }
 314 }
 315