/* ******************************************************************************* * Copyright (C) 1996-2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.tool.rbbi; import java.io.DataOutputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.util.Vector; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.CompactByteArray; public class BuildDictionaryFile { public static void main(String args[]) throws FileNotFoundException, UnsupportedEncodingException, IOException { String filename = args[0]; String encoding = ""; String outputFile = ""; String listingFile = ""; if (args.length >= 2) encoding = args[1]; if(args.length >= 3) outputFile = args[2]; if (args.length >= 4) listingFile = args[3]; BuildDictionaryFile dictionary = new BuildDictionaryFile(); dictionary.build(filename, encoding); DataOutputStream out = null; if (outputFile.length() != 0) { out = new DataOutputStream(new FileOutputStream(outputFile)); dictionary.writeDictionaryFile(out); } PrintWriter listing = null; if (listingFile.length() != 0) { listing = new PrintWriter(new OutputStreamWriter(new FileOutputStream(listingFile), "UnicodeLittle")); dictionary.printWordList("", 0, listing); listing.close(); } } public BuildDictionaryFile() { } public void build(String filename, String encoding) throws FileNotFoundException, UnsupportedEncodingException, IOException { FileInputStream file = new FileInputStream(filename); InputStreamReader in; if (encoding.length() == 0) in = new InputStreamReader(file); else in = new InputStreamReader(file, encoding); buildColumnMap(in); file = new FileInputStream(filename); if (encoding.length() == 0) in = new InputStreamReader(file); else in = new InputStreamReader(file, encoding); buildStateTable(in); //printTable(); } public void buildColumnMap(InputStreamReader in) throws IOException { System.out.println("Building column map..."); UnicodeSet charsInFile = new UnicodeSet(); int c = in.read(); int totalChars = 0; while (c >= 0) { ++totalChars; if (totalChars > 0 && totalChars % 5000 == 0) System.out.println("Read " + totalChars + " characters..."); if (c > ' ') charsInFile.add((char)c); c = in.read(); } // Test.debugPrintln(charsInFile.toString()); StringBuffer tempReverseMap = new StringBuffer(); tempReverseMap.append(' '); columnMap = new CompactByteArray(); int n = charsInFile.getRangeCount(); byte p = 1; for (int i=0; i= 0) { charsInWord++; short column = columnMap.elementAt((char)c); row = (int[])(tempTable.elementAt(state)); if (column != 0) { if (row[column] == 0) { row[column] = tempTable.size(); ++row[numCols]; state = (tempTable.size()); tempTable.addElement(new int[numCols + 1]); } else state = row[column]; } else if (state != 0) { if (row[0] != -1) { row[0] = -1; ++row[numCols]; uniqueWords++; totalUniqueWordChars += charsInWord; } totalWords++; if (totalWords % 5000 == 0) System.out.println("Read " + totalWords + " words, " + tempTable.size() + " rows..."); charsInWord = 0; state = 0; } c = in.read(); } if (state != 0) { row = (int[])(tempTable.elementAt(state)); if (row[0] != -1) { row[0] = -1; uniqueWords++; totalUniqueWordChars += charsInWord; } totalWords++; } compress(tempTable); table = new short[numCols * tempTable.size()]; for (int i = 0; i < tempTable.size(); i++) { row = (int[])tempTable.elementAt(i); for (int j = 0; j < numCols; j++) table[i * numCols + j] = (short)row[j]; } } private void compress(Vector tempTable) { System.out.println("Before compression:"); System.out.println(" Number of rows = " + tempTable.size()); System.out.println(" Number of columns = " + numCols); System.out.println(" Number of cells = " + tempTable.size() * numCols); deleteDuplicateRows(tempTable); System.out.println("After removing duplicate rows:"); System.out.println(" Number of rows = " + tempTable.size()); System.out.println(" Number of columns = " + numCols); System.out.println(" Number of cells = " + tempTable.size() * numCols); stackRows(tempTable); if (tempTable.size() > 32767) throw new IllegalArgumentException("Too many rows in table!"); System.out.println("After doubling up on rows:"); System.out.println(" Number of rows = " + tempTable.size()); System.out.println(" Number of columns = " + numCols); System.out.println(" Number of cells = " + tempTable.size() * numCols); } /* experimental... private void deleteDuplicateRows(Vector tempTable) { int[] rowNumMap = new int[tempTable.size()]; for (int i = 0; i < rowNumMap.length; i++) rowNumMap[i] = i; int nextClass = numCols; int currentClass; int lastClass; boolean split; int[] row1, row2, tempRow; int tempCat; do { System.out.println("Making a pass (" + nextClass + " classes)..."); currentClass = 0; lastClass = nextClass; while (currentClass < nextClass) { System.out.println(" currentClass = " + currentClass); split = false; row1 = row2 = null; for (int i = 0; i < tempTable.size(); i++) { tempRow = (int[])tempTable.elementAt(i); if (tempRow[numCols] == currentClass) { if (row1 == null) { row1 = (int[])tempTable.elementAt(i); } else { row2 = (int[])tempTable.elementAt(i); for (int j = 0; j < numCols; j++) { if ((row1[j] == 0) != (row2[j] == 0) || (row1[j] == -1) != (row2[j] == -1)) { row2[numCols] = nextClass; split = true; break; } else if (row1[j] != 0 && row2[j] != 0 && row1[j] != -1 && row2[j] != -1) { tempRow = (int[])tempTable.elementAt(row1[j]); tempCat = tempRow[numCols]; tempRow = (int[])tempTable.elementAt(row2[j]); if (tempCat != tempRow[numCols]) { row2[numCols] = nextClass; split = true; break; } } } } } } if (split) ++nextClass; ++currentClass; //System.out.println(); } } while (lastClass != nextClass); int[] representatives = new int[nextClass]; for (int i = 1; i < tempTable.size(); i++) { tempRow = (int[])tempTable.elementAt(i); if (representatives[tempRow[numCols]] == 0) representatives[tempRow[numCols]] = i; else rowNumMap[i] = representatives[tempRow[numCols]]; } System.out.println("Renumbering..."); // renumber all remaining rows for (int i = 0; i < rowNumMap.length; i++) if (rowNumMap[i] != i) tempTable.setElementAt(null, i); int newRowNum = 0; for (int i = 0; i < rowNumMap.length; i++) if (tempTable.elementAt(i) != null) rowNumMap[i] = newRowNum++; for (int i = 0; i < rowNumMap.length; i++) if (tempTable.elementAt(i) == null) rowNumMap[i] = rowNumMap[rowNumMap[i]]; for (int i = tempTable.size() - 1; i >= 0; i--) { tempRow = (int[])tempTable.elementAt(i); if (tempRow == null) tempTable.removeElementAt(i); else { for (int j = 0; j < numCols; j++) if (tempRow[j] != -1) tempRow[j] = rowNumMap[j]; } } //for (int i = 1; i < rowNumMap.length; i++) rowNumMap[i] = i; int newRowNum = rowNumMap.length; } */ private void deleteDuplicateRows(Vector tempTable) { Vector work = (Vector)(tempTable.clone()); boolean didDeleteRow = true; Vector tempMapping = new Vector(work.size()); int[] mapping = new int[work.size()]; for (int i = 0; i < mapping.length; i++) { mapping[i] = i; tempMapping.addElement(new Integer(i)); } boolean[] tbd = new boolean[work.size()]; while (didDeleteRow) { System.out.println(" " + work.size() + " rows..."); int deletedRows = 0; didDeleteRow = false; sortTable(work, tempMapping, mapping, 1, work.size()); for (int i = 0; i < work.size() - 1; ) { System.out.print("Deleting, inspecting row " + i + ", deleted " + deletedRows + " rows...\r"); int rowToDelete = ((Integer)(tempMapping.elementAt(i + 1))).intValue(); int rowToMapTo = ((Integer)(tempMapping.elementAt(i))).intValue(); if (compareRows((int[])work.elementAt(i), (int[])work.elementAt(i + 1), mapping) == 0) { tbd[rowToDelete] = true; tempTable.setElementAt(null, rowToDelete); while (tbd[mapping[rowToMapTo]]) mapping[rowToMapTo] = mapping[mapping[rowToMapTo]]; mapping[rowToDelete] = mapping[rowToMapTo]; didDeleteRow = true; deletedRows++; work.removeElementAt(i + 1); tempMapping.removeElementAt(i + 1); } else i++; } for (int i = 0; i < mapping.length; i++) { if (tbd[i] && tbd[mapping[i]]) mapping[i] = mapping[mapping[i]]; } } int decrementBy = 0; for (int i = 0; i < mapping.length; i++) { if (tbd[i]) decrementBy++; else mapping[i] -= decrementBy; } for (int i = 0; i < mapping.length; i++) { if (tbd[i]) mapping[i] = mapping[mapping[i]]; } for (int i = tempTable.size() - 1; i >= 0; i--) { if (tbd[i]) tempTable.removeElementAt(i); else { int[] row = (int[])tempTable.elementAt(i); for (int j = 0; j < numCols; j++) row[j] = (row[j] == -1) ? -1 : mapping[row[j]]; } } } private void sortTable(Vector tbl, Vector tempMapping, int[] mapping, int start, int end) { System.out.print("Sorting (" + start + ", " + end + ")...\r"); if (start + 1 >= end) return; else if (start + 10 >= end) { for (int i = start + 1; i < end; i++) { int[] row = (int[])tbl.elementAt(i); Integer tempMap = (Integer)tempMapping.elementAt(i); int j; for (j = i - 1; j >= start; j--) { if (compareRows((int[])tbl.elementAt(j), row, mapping) > 0) { tbl.setElementAt((int[])tbl.elementAt(j), j + 1); tempMapping.setElementAt((Integer)tempMapping.elementAt(j), j + 1); } else { tbl.setElementAt(row, j + 1); tempMapping.setElementAt(tempMap, j + 1); break; } } if (j < start) { tbl.setElementAt(row, start); tempMapping.setElementAt(tempMap, start); } } } else { int boundaryPos = (start + end) / 2; int i; boolean allTheSame = true; int firstDifferent = 0; do { int[] boundary = (int[])tbl.elementAt(boundaryPos); i = start; int j = end - 1; int[] row = null; byte compResult; while (i < j) { row = (int[])tbl.elementAt(i); while (i <= j && compareRows(row, boundary, mapping) < 0) { i++; row = (int[])tbl.elementAt(i); } row = (int[])tbl.elementAt(j); compResult = compareRows(row, boundary, mapping); while (i <= j && (compResult >= 0)) { if (compResult != 0) { allTheSame = false; firstDifferent = j; } j--; row = (int[])tbl.elementAt(j); compResult = compareRows(row, boundary, mapping); } if (i <= j) { row = (int[])tbl.elementAt(j); tbl.setElementAt(tbl.elementAt(i), j); tbl.setElementAt(row, i); Object temp = tempMapping.elementAt(j); tempMapping.setElementAt(tempMapping.elementAt(i), j); tempMapping.setElementAt(temp, i); } } if (i <= start) { if (allTheSame) return; else boundaryPos = firstDifferent; } } while (i <= start); sortTable(tbl, tempMapping, mapping, start, i); sortTable(tbl, tempMapping, mapping, i, end); } } private byte compareRows(int[] row1, int[] row2, int[] mapping) { for (int i = 0; i < numCols; i++) { int c1 = (row1[i] == -1) ? -1 : mapping[row1[i]]; int c2 = (row2[i] == -1) ? -1 : mapping[row2[i]]; if (c1 < c2) return -1; else if (c1 > c2) return 1; } return 0; } private int[] buildRowIndex(Vector tempTable) { int[] tempRowIndex = new int[tempTable.size()]; rowIndexFlagsIndex = new short[tempTable.size()]; Vector tempRowIndexFlags = new Vector(); rowIndexShifts = new byte[tempTable.size()]; // build the row index. Each entry in the row index starts out referring // to the original row (so it doesn't actually do any mapping), and we set // up the index flags to show which cells in the row are populated for (int i = 0; i < tempTable.size(); i++) { tempRowIndex[i] = i; int[] row = (int[])tempTable.elementAt(i); if (row[numCols] == 1 && row[0] == 0) { int j = 0; while (row[j] == 0) ++j; rowIndexFlagsIndex[i] = (short)(-j); } else { int[] flags = new int[numColGroups]; int nextFlag = 1; int colGroup = 0; for (int j = 0; j < numCols; j++) { if (row[j] != 0) flags[colGroup] |= nextFlag; nextFlag <<= 1; if (nextFlag == 0) { ++colGroup; nextFlag = 1; } } colGroup = 0; int j = 0; while (j < tempRowIndexFlags.size()) { if (((Integer)tempRowIndexFlags.elementAt(j)).intValue() == flags[colGroup]) { ++colGroup; ++j; if (colGroup >= numColGroups) break; } else if (colGroup != 0) colGroup = 0; else ++j; } rowIndexFlagsIndex[i] = (short)(j - colGroup); while (colGroup < numColGroups) { tempRowIndexFlags.addElement(new Integer(flags[colGroup])); ++colGroup; } } } rowIndexFlags = new int[tempRowIndexFlags.size()]; for (int i = 0; i < rowIndexFlags.length; i++) rowIndexFlags[i] = ((Integer)tempRowIndexFlags.elementAt(i)).intValue(); System.out.println("Number of column groups = " + numColGroups); System.out.println("Size of rowIndexFlags = " + rowIndexFlags.length); return tempRowIndex; } private void stackRows(Vector tempTable) { /* System.out.print("Row:\t"); for (int i = 0; i < numCols; i++) System.out.print(reverseColumnMap[i] + "\t"); System.out.println(); for (int i = 0; i < tempTable.size(); i++) { System.out.print(Integer.toString(i) + ":\t"); int[] row = (int[])tempTable.elementAt(i); for (int j = 0; j < row.length; j++) if (row[j] != 0) System.out.print(Integer.toString(row[j]) + "\t"); else System.out.print(".\t"); System.out.println(); } */ int[] tempRowIndex = buildRowIndex(tempTable); boolean[] tbd = new boolean[tempTable.size()]; // now we actually go through and stack rows together for (int i = 0; i < tempTable.size(); i++) { if (tbd[i]) continue; System.out.print("Stacking, inspecting row " + i + "...\r"); //System.out.println("Stacking, inspecting row " + i + "..."); int[] destRow = (int[])tempTable.elementAt(i); boolean[] tempFlags = new boolean[numCols]; boolean[] filledCells = new boolean[numCols]; for (int j = 0; j < numCols; j++) filledCells[j] = destRow[j] != 0; for (int j = i + 1; destRow[numCols] < numCols && j < tempTable.size(); j++) { if (tbd[j]) continue; int[] srcRow = (int[])tempTable.elementAt(j); if (srcRow[numCols] + destRow[numCols] > numCols) continue; int maxLeftShift = -999; int maxRightShift = 0; for (int k = 0; k < numCols; k++) { tempFlags[k] = srcRow[k] != 0; if (tempFlags[k]) { if (maxLeftShift == -999) maxLeftShift = -k; maxRightShift = (numCols - 1) - k; } } int shift; for (shift = maxLeftShift; shift <= maxRightShift; shift++) { int k; for (k = 0; k < numCols; k++) { if (tempFlags[k] && filledCells[k + shift]) break; } if (k >= numCols) break; } if (shift <= maxRightShift) { //System.out.println("Packing row " + j + " into row " + i + " with shift = " + shift); for (int k = 0; k < numCols; k++) { if (tempFlags[k]) { filledCells[k + shift] = true; destRow[k + shift] = srcRow[k]; ++destRow[numCols]; } } tbd[j] = true; tempRowIndex[j] = i; rowIndexShifts[j] = (byte)shift; } } } // finally, we squeeze out all the deleted rows int decrementBy = 0; for (int i = 0; i < tempRowIndex.length; i++) { if (!tbd[i]) tempRowIndex[i] -= decrementBy; else ++decrementBy; } rowIndex = new short[tempRowIndex.length]; for (int i = tempRowIndex.length - 1; i >= 0; i--) { if (tbd[i]) { rowIndex[i] = (short)(tempRowIndex[tempRowIndex[i]]); tempTable.removeElementAt(i); } else rowIndex[i] = (short)tempRowIndex[i]; } } // private void printTable() { // short cell; // int populatedCells = 0; ///* // System.out.println("Conceptual table:"); // System.out.println(" Row: a b c d e f g h i j k l m n" // + " o p q r s t u v w x y z ' #"); // // boolean[] rowPrintFlags = new boolean[rowIndex.length]; // printConceptualTable("", 0, rowPrintFlags); //*/ // // System.out.println(); // System.out.println("Conceptual table:"); // System.out.print(" Row:"); // for (int i = 0; i < reverseColumnMap.length; i++) { // System.out.print(" " + reverseColumnMap[i]); // } // for (int i = 0; i < rowIndex.length; i++) { // System.out.println(); // printNumber(i, 4); // System.out.print(":"); // for (int j = 0; j < numCols; j++) // printNumber(at(i, j), 4); // } // System.out.println('\n'); // // System.out.println(); // System.out.println("Internally stored table:"); // System.out.print(" Row:"); // for (int i = 0; i < reverseColumnMap.length; i++) { // System.out.print(" " + reverseColumnMap[i]); // } // for (int i = 0; i < table.length; i++) { // if (i % numCols == 0) { // System.out.println(); // printNumber(i / numCols, 4); // System.out.print(":"); // } // cell = table[i]; // if (cell != 0) // populatedCells++; // printNumber(cell, 4); // } // System.out.println('\n'); // //System.out.println("Row index:"); //for (int i = 0; i < rowIndex.length; i++) { // System.out.print(" " + i + " -> " + rowIndex[i]); // if (rowIndexFlagsIndex[i] < 0) // System.out.print(", flags = " + Integer.toBinaryString((1 << (-rowIndexFlagsIndex[i]))) + " (" + rowIndexFlagsIndex[i]); // else // System.out.print(", flags = " + Integer.toBinaryString(rowIndexFlags[rowIndexFlagsIndex[i]]) + " (" + rowIndexFlagsIndex[i]); // System.out.println("), shift = " + rowIndexShifts[i]); //} ///* // int theoreticalMinRows = populatedCells / numCols; // if (populatedCells % numCols != 0) // theoreticalMinRows++; // int oneCellRows = 0; // for (int i = 0; i < rowIndexFlags.length; i++) { // double temp = Math.log(rowIndexFlags[i]) / Math.log(2); // if (temp == (int)temp) // oneCellRows++; // } // // System.out.println('\n'); // System.out.println("Total words in input = " + totalWords); // System.out.println("Total unique words = " + uniqueWords + ", comprising " + // totalUniqueWordChars + " characters\n"); // System.out.println("Number of populated cells = " + populatedCells); // System.out.println("Total number of cells = " + (table.length)); // System.out.println("Residency = " + ((float)populatedCells / table.length * 100) + '%'); // System.out.println("Ratio of populated cells to unique-word characters = " + // ((float)populatedCells / totalUniqueWordChars * 100) + '%'); // System.out.println("Ratio of total cells to unique-word characters = " + // ((float)table.length / totalUniqueWordChars * 100) + '%'); // System.out.println("Number of rows = " + (table.length / numCols)); // System.out.println("Theoretical minimum number of rows = " + theoreticalMinRows); // System.out.println("Ratio of number of rows to theoretical minimum = " + // ((float)(table.length / numCols) / theoreticalMinRows * 100) + '%'); // System.out.println("Number of conceptual rows = " + rowIndex.length); // System.out.println("Conceptual rows with only one populated cell = " + oneCellRows); // System.out.println("Ratio of one-cell rows to total conceptual rows = " + (((float)oneCellRows) // / rowIndex.length * 100) + '%'); // System.out.println("Average number of populated cells in multi-cell rows = " + // ((float)(populatedCells - oneCellRows) / (rowIndex.length - oneCellRows))); // // int storageUsed = table.length * 2 + rowIndex.length * 2 // + rowIndexFlags.length * 4 + rowIndexShifts.length; // System.out.println("Total number of bytes in table (including indexes) = " + // storageUsed); // System.out.println("Bytes of overhead per unique-word character = " + ((double)(storageUsed // - (totalUniqueWordChars * 2)) / totalUniqueWordChars)); //*/ // } // private void printConceptualTable(String initialString, int state, boolean[] flags) { // if (initialString.length() == 0) // System.out.println("root:"); // else // System.out.println(initialString + ':'); // // if (!flags[state]) { // flags[state] = true; // printNumber(state, 4); // System.out.print(":"); // for (int i = 0; i < numCols; i++) // printNumber(at(state, i), 4); // System.out.println(); // } // // int nextState; // for (int i = 0; i < numCols; i++) { // nextState = at(state, i); // if (nextState > 0 && !flags[nextState]) { // printNumber(nextState, 4); // System.out.print(":"); // for (int j = 0; j < numCols; j++) // printNumber(at(nextState, j), 4); // System.out.println(); // } // } // for (int i = 0; i < numCols; i++) { // nextState = at(state, i); // if (nextState > 0 && !flags[nextState]) { // char nextChar; // if (nextState == 27) // nextChar = ' '; // else if (nextState == 26) // nextChar = '\''; // else // nextChar = (char)(i + 'a'); // flags[nextState] = true; // printConceptualTable(initialString + nextChar, nextState, flags); // } // } // } private void printWordList(String partialWord, int state, PrintWriter out) throws IOException { if (state == -1) { System.out.println(partialWord); if (out != null) out.println(partialWord); } else { for (int i = 0; i < numCols; i++) { if (at(state, i) != 0) printWordList(partialWord + reverseColumnMap[i], at(state, i), out); } } } private void writeDictionaryFile(DataOutputStream out) throws IOException { out.writeInt(0); // version number char[] columnMapIndexes = columnMap.getIndexArray(); out.writeInt(columnMapIndexes.length); for (int i = 0; i < columnMapIndexes.length; i++) out.writeShort((short)columnMapIndexes[i]); byte[] columnMapValues = columnMap.getValueArray(); out.writeInt(columnMapValues.length); for (int i = 0; i < columnMapValues.length; i++) out.writeByte((byte)columnMapValues[i]); out.writeInt(numCols); out.writeInt(numColGroups); out.writeInt(rowIndex.length); for (int i = 0; i < rowIndex.length; i++) out.writeShort(rowIndex[i]); out.writeInt(rowIndexFlagsIndex.length); for (int i = 0; i < rowIndexFlagsIndex.length; i++) out.writeShort(rowIndexFlagsIndex[i]); out.writeInt(rowIndexFlags.length); for (int i = 0; i < rowIndexFlags.length; i++) out.writeInt(rowIndexFlags[i]); out.writeInt(rowIndexShifts.length); for (int i = 0; i < rowIndexShifts.length; i++) out.writeByte(rowIndexShifts[i]); out.writeInt(table.length); for (int i = 0; i < table.length; i++) out.writeShort(table[i]); out.close(); } // private void printNumber(int x, int width) { // String s = String.valueOf(x); // if (width > s.length()) // System.out.print(spaces.substring(0, width - s.length())); // if (x != 0) // System.out.print(s); // else // System.out.print('.'); // } public final short at(int row, char ch) { int col = columnMap.elementAt(ch); return at(row, col); } public final short at(int row, int col) { if (cellIsPopulated(row, col)) return internalAt(rowIndex[row], col + rowIndexShifts[row]); else return 0; } private final boolean cellIsPopulated(int row, int col) { if (rowIndexFlagsIndex[row] < 0) return col == -rowIndexFlagsIndex[row]; else { int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; return (flags & (1 << (col & 0x1f))) != 0; } } private final short internalAt(int row, int col) { return table[row * numCols + col]; } private CompactByteArray columnMap = null; private char[] reverseColumnMap = null; private int numCols; private int numColGroups; private short[] table = null; private short[] rowIndex = null; private int[] rowIndexFlags = null; private short[] rowIndexFlagsIndex = null; private byte[] rowIndexShifts = null; private int totalWords = 0; private int uniqueWords = 0; private int totalUniqueWordChars = 0; //private static final String spaces = " "; }