1 package com.hughes.android.dictionary;
\r
4 public class ZIndexBuilder {
\r
6 // static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
7 // static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");
\r
9 // public static void main(String[] args) throws IOException,
\r
10 // ClassNotFoundException {
\r
11 // if (args.length != 1) {
\r
12 // System.err.println("No input file.");
\r
15 // final String dictionaryFileName = args[0];
\r
16 // createIndex(dictionaryFileName, Entry.LANG1);
\r
17 // createIndex(dictionaryFileName, Entry.LANG2);
\r
20 // private static void createIndex(final String dictionaryFileName,
\r
21 // final byte lang) throws IOException, FileNotFoundException,
\r
22 // ClassNotFoundException {
\r
23 // Node rootBuilder;
\r
24 // rootBuilder = processDictionaryLines(dictionaryFileName, lang);
\r
25 // FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang));
\r
26 // rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang));
\r
28 // rootBuilder.forEachNode(new Function<Node>() {
\r
30 // public void invoke(final Node node) {
\r
31 // for (final List<EntryDescriptor> entryDescriptors : node.entryDescriptorsMap.values()) {
\r
32 // Collections.sort(entryDescriptors);
\r
36 // // Dump twice to get accurate file locations.
\r
37 // for (int i = 0; i < 2; ++i) {
\r
38 // final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw");
\r
39 // rootBuilder.dump(raf);
\r
44 // // ----------------------------------------------------------------
\r
46 // static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {
\r
47 // final int offset;
\r
48 // final int numTokens;
\r
49 // public EntryDescriptor(int offset, int numTokens) {
\r
50 // this.offset = offset;
\r
51 // this.numTokens = numTokens;
\r
54 // public boolean equals(Object obj) {
\r
55 // final EntryDescriptor that = (EntryDescriptor) obj;
\r
56 // return this.offset == that.offset;
\r
59 // public int hashCode() {
\r
63 // public int compareTo(EntryDescriptor o) {
\r
64 // return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;
\r
69 // // ----------------------------------------------------------------
\r
71 // static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException {
\r
72 // final Node root = new Node("");
\r
73 // final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r");
\r
75 // final Entry entry = new Entry();
\r
76 // int lineCount = 0;
\r
77 // long fileLocation = 0;
\r
78 // while ((line = dictionaryFile.readLine()) != null) {
\r
79 // assert ((int) fileLocation) == fileLocation;
\r
81 // line = line.trim();
\r
82 // if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {
\r
85 // final String text = entry.getIndexableText(Entry.LANG1);
\r
86 // final String[] tokens = WHITESPACE.split(text);
\r
87 // final Map<String,String> tokenToNormalizedMap = new LinkedHashMap<String,String>();
\r
88 // for (String token : tokens) {
\r
89 // if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {
\r
92 // tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token));
\r
94 // for (final Map.Entry<String, String> tokenToNormalized : tokenToNormalizedMap.entrySet()) {
\r
95 // final String normalizedToken = tokenToNormalized.getValue();
\r
96 // final Node node = root.getNode(normalizedToken, 0, true);
\r
97 // node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length));
\r
98 // assert node == root.getNode(normalizedToken, 0, false);
\r
99 // assert normalizedToken
\r
100 // .equals(root.getNode(normalizedToken, 0, false).normalizedToken);
\r
103 // if (lineCount % 10000 == 0) {
\r
104 // System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
108 // fileLocation = dictionaryFile.getFilePointer();
\r
110 // dictionaryFile.close();
\r
112 // root.recursiveSetDescendantCounts();
\r