-package com.hughes.android.dictionary;\r
-\r
-import java.io.FileNotFoundException;\r
-import java.io.IOException;\r
-import java.io.RandomAccessFile;\r
-import java.io.Serializable;\r
-import java.util.ArrayList;\r
-import java.util.Collections;\r
-import java.util.LinkedHashMap;\r
-import java.util.List;\r
-import java.util.Map;\r
-import java.util.TreeMap;\r
-import java.util.regex.Pattern;\r
-\r
-import com.hughes.util.FileUtil;\r
-\r
-public class IndexBuilder {\r
-\r
- static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
- static final Pattern NONALPHA = Pattern.compile("[^A-Za-z]+");\r
-\r
- public static void main(String[] args) throws IOException,\r
- ClassNotFoundException {\r
- if (args.length != 1) {\r
- System.err.println("No input file.");\r
- return;\r
- }\r
- final String dictionaryFileName = args[0];\r
- createIndex(dictionaryFileName, Entry.LANG1);\r
- createIndex(dictionaryFileName, Entry.LANG2);\r
- }\r
-\r
- private static void createIndex(final String dictionaryFileName,\r
- final byte lang) throws IOException, FileNotFoundException,\r
- ClassNotFoundException {\r
- Node rootBuilder;\r
- rootBuilder = processDictionaryLines(dictionaryFileName, lang);\r
- FileUtil.write(rootBuilder, String.format("%s_builder_%d.serialized", dictionaryFileName, lang));\r
- rootBuilder = (Node) FileUtil.read(String.format("%s_builder_%d.serialized", dictionaryFileName, lang));\r
-\r
- rootBuilder.forEachNode(new Function<Node>() {\r
- @Override\r
- public void invoke(final Node node) {\r
- for (final List<EntryDescriptor> entryDescriptors : node.entryDescriptorsMap.values()) {\r
- Collections.sort(entryDescriptors);\r
- }\r
- }});\r
- \r
- // Dump twice to get accurate file locations.\r
- for (int i = 0; i < 2; ++i) {\r
- final RandomAccessFile raf = new RandomAccessFile(String.format(Dictionary.INDEX_FORMAT, dictionaryFileName, lang), "rw"); \r
- rootBuilder.dump(raf);\r
- raf.close();\r
- }\r
- }\r
-\r
- // ----------------------------------------------------------------\r
- \r
- static final class EntryDescriptor implements Comparable<EntryDescriptor>, Serializable {\r
- final int offset;\r
- final int numTokens;\r
- public EntryDescriptor(int offset, int numTokens) {\r
- this.offset = offset;\r
- this.numTokens = numTokens;\r
- }\r
- @Override\r
- public boolean equals(Object obj) {\r
- final EntryDescriptor that = (EntryDescriptor) obj;\r
- return this.offset == that.offset;\r
- }\r
- @Override\r
- public int hashCode() {\r
- return offset;\r
- }\r
- @Override\r
- public int compareTo(EntryDescriptor o) {\r
- return this.numTokens < o.numTokens ? -1 : this.numTokens == o.numTokens ? 0 : 1;\r
- }\r
- }\r
-\r
-\r
- // ----------------------------------------------------------------\r
-\r
- static Node processDictionaryLines(final String dictionaryFileName, final byte lang) throws IOException {\r
- final Node root = new Node("");\r
- final RandomAccessFile dictionaryFile = new RandomAccessFile(dictionaryFileName, "r");\r
- String line;\r
- final Entry entry = new Entry();\r
- int lineCount = 0;\r
- long fileLocation = 0;\r
- while ((line = dictionaryFile.readLine()) != null) {\r
- assert ((int) fileLocation) == fileLocation;\r
-\r
- line = line.trim();\r
- if (line.isEmpty() || line.startsWith("#") || !entry.parseFromLine(line)) {\r
- continue;\r
- }\r
- final String text = entry.getIndexableText(Entry.LANG1);\r
- final String[] tokens = WHITESPACE.split(text);\r
- final Map<String,String> tokenToNormalizedMap = new LinkedHashMap<String,String>();\r
- for (String token : tokens) {\r
- if (token.length() <= 1 || !Character.isLetter(token.charAt(0))) {\r
- continue;\r
- }\r
- tokenToNormalizedMap.put(token, EntryFactory.entryFactory.normalizeToken(token));\r
- }\r
- for (final Map.Entry<String, String> tokenToNormalized : tokenToNormalizedMap.entrySet()) {\r
- final String normalizedToken = tokenToNormalized.getValue();\r
- final Node node = root.getNode(normalizedToken, 0, true);\r
- node.addToken(tokenToNormalized.getKey(), new EntryDescriptor((int) fileLocation, tokens.length));\r
- assert node == root.getNode(normalizedToken, 0, false);\r
- assert normalizedToken\r
- .equals(root.getNode(normalizedToken, 0, false).normalizedToken);\r
- }\r
-\r
- if (lineCount % 10000 == 0) {\r
- System.out.println("IndexBuilder: " + "lineCount=" + lineCount);\r
- }\r
- \r
- lineCount++;\r
- fileLocation = dictionaryFile.getFilePointer();\r
- }\r
- dictionaryFile.close();\r
- \r
- root.recursiveSetDescendantCounts();\r
- \r
- return root;\r
- }\r
-\r
-}\r