1 package com.hughes.android.dictionary;
\r
3 import java.io.BufferedReader;
\r
5 import java.io.FileInputStream;
\r
6 import java.io.FileNotFoundException;
\r
7 import java.io.IOException;
\r
8 import java.io.InputStreamReader;
\r
9 import java.io.RandomAccessFile;
\r
10 import java.nio.charset.Charset;
\r
11 import java.util.ArrayList;
\r
12 import java.util.Collections;
\r
13 import java.util.List;
\r
14 import java.util.Map;
\r
15 import java.util.Random;
\r
16 import java.util.Set;
\r
17 import java.util.TreeMap;
\r
19 import javax.xml.parsers.ParserConfigurationException;
\r
21 import org.xml.sax.SAXException;
\r
23 import com.hughes.android.dictionary.Dictionary.IndexEntry;
\r
24 import com.hughes.android.dictionary.Dictionary.LanguageData;
\r
25 import com.hughes.android.dictionary.Dictionary.Row;
\r
26 import com.hughes.util.Args;
\r
27 import com.hughes.util.FileUtil;
\r
29 public class DictionaryBuilder {
\r
31 public static void main(String[] args) throws IOException,
\r
32 ClassNotFoundException, ParserConfigurationException, SAXException {
\r
34 final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
\r
36 final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
\r
37 final Language lang2 = Language.lookup(keyValueArgs.remove("lang2"));
\r
38 if (lang1 == null || lang2 == null) {
\r
39 fatalError("--lang1= and --lang2= must both be specified.");
\r
42 final String dictOutFilename = keyValueArgs.remove("dictOut");
\r
43 if (dictOutFilename == null) {
\r
44 fatalError("--dictOut= must be specified.");
\r
47 String summaryText = keyValueArgs.remove("summaryText");
\r
48 if (summaryText == null) {
\r
49 fatalError("--summaryText= must be specified.");
\r
51 if (summaryText.startsWith("@")) {
\r
52 summaryText = FileUtil.readToString(new File(summaryText.substring(1)));
\r
55 final String maxEntriesString = keyValueArgs.remove("maxEntries");
\r
56 final int maxEntries = maxEntriesString == null ? Integer.MAX_VALUE : Integer.parseInt(maxEntriesString);
\r
58 System.out.println("lang1=" + lang1);
\r
59 System.out.println("lang2=" + lang2);
\r
60 System.out.println("summaryText=" + summaryText);
\r
61 System.out.println("dictOut=" + dictOutFilename);
\r
63 final Dictionary dict = new Dictionary(summaryText, lang1, lang2);
\r
65 for (int i = 0; i < 100; ++i) {
\r
66 final String prefix = "input" + i;
\r
67 if (keyValueArgs.containsKey(prefix)) {
\r
68 final File file = new File(keyValueArgs.remove(prefix));
\r
69 System.out.println("Processing: " + file);
\r
70 String charsetName = keyValueArgs.remove(prefix + "Charset");
\r
71 if (charsetName == null) {
\r
72 charsetName = "UTF8";
\r
74 final Charset charset = Charset.forName(charsetName);
\r
75 String inputName = keyValueArgs.remove(prefix + "Name");
\r
76 if (inputName == null) {
\r
77 fatalError("Must specify human readable name for: " + prefix + "Name");
\r
80 String inputFormat = keyValueArgs.remove(prefix + "Format");
\r
81 if ("dictcc".equals(inputFormat)) {
\r
82 processLinedInputFile(dict, file, charset, false, maxEntries);
\r
83 } else if ("chemnitz".equals(inputFormat)) {
\r
84 processLinedInputFile(dict, file, charset, true, maxEntries);
\r
85 } else if ("wiktionary".equals(inputFormat)) {
\r
86 new WiktionaryXmlParser(dict).parse(file);
\r
88 fatalError("Invalid or missing input format: " + inputFormat);
\r
91 dict.sources.add(inputName);
\r
92 System.out.println("Done: " + file + "\n\n");
\r
96 if (!keyValueArgs.isEmpty()) {
\r
97 System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);
\r
100 createIndex(dict, SimpleEntry.LANG1);
\r
101 createIndex(dict, SimpleEntry.LANG2);
\r
103 System.out.println("Writing dictionary.");
\r
104 final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
\r
105 dictOut.setLength(0);
\r
106 dict.write(dictOut);
\r
109 final Random random = new Random(0);
\r
110 for (byte lang = 0; lang < 2; ++lang) {
\r
111 final LanguageData languageData = dict.languageDatas[lang];
\r
112 System.out.println("\nRandom words for: " + languageData.language.getSymbol());
\r
113 for (int i = 0; i < 20; ++i) {
\r
114 final int w = random.nextInt(languageData.sortedIndex.size());
\r
115 final IndexEntry entry = languageData.sortedIndex.get(w);
\r
116 final List<Row> rows = languageData.rows;
\r
117 int r = entry.startRow;
\r
118 System.out.println(languageData.rowToString(rows.get(r), false));
\r
120 while (r < rows.size() && !rows.get(r).isToken()) {
\r
121 System.out.println(" " + languageData.rowToString(rows.get(r), false));
\r
128 private static void fatalError(String string) {
\r
129 System.err.println(string);
\r
133 private static void processLinedInputFile(final Dictionary dict, final File file,
\r
134 final Charset charset, final boolean hasMultipleSubentries,
\r
135 final int maxEntries) throws FileNotFoundException, IOException {
\r
136 final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
\r
139 while ((line = reader.readLine()) != null && lineCount < maxEntries) {
\r
140 if (maxEntries < 200) {
\r
141 System.out.println(line);
\r
143 line = line.trim();
\r
144 if (line.equals("") || line.startsWith("#")) {
\r
148 final SimpleEntry entry = SimpleEntry.parseFromLine(line, hasMultipleSubentries);
\r
149 if (entry == null) {
\r
150 System.err.println("Invalid entry: " + line);
\r
154 dict.entries.add(entry);
\r
156 if (lineCount % 10000 == 0) {
\r
157 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
164 public static void createIndex(final Dictionary dict, final byte lang) {
\r
165 System.out.println("Creating index: " + lang);
\r
167 final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);
\r
169 for (int e = 0; e < dict.entries.size(); ++e) {
\r
170 final SimpleEntry entry = null; //dict.entries.get(e);
\r
171 final Set<String> tokens = entry.getIndexableTokens(lang);
\r
172 for (final String token : tokens) {
\r
173 TokenData tokenData = tokenToData.get(token);
\r
174 if (tokenData == null) {
\r
175 tokenData = new TokenData(token);
\r
176 tokenToData.put(token, tokenData);
\r
178 tokenData.entries.add(new TokenEntryData(lang, token, entry, e));
\r
181 if (e % 10000 == 0) {
\r
182 System.out.println("createIndex: " + "e=" + e);
\r
188 System.out.println("Sorting TokenData...");
\r
189 final List<TokenData> sortedTokenData = new ArrayList<TokenData>(tokenToData
\r
192 System.out.println("Sorting entries within each TokenData...");
\r
193 for (final TokenData tokenData : sortedTokenData) {
\r
194 Collections.sort(tokenData.entries);
\r
197 // Put it all together.
\r
198 System.out.println("Assembling final data structures...");
\r
199 final List<Row> rows = dict.languageDatas[lang].rows;
\r
200 final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;
\r
201 for (int t = 0; t < sortedTokenData.size(); ++t) {
\r
202 final TokenData tokenData = sortedTokenData.get(t);
\r
203 final int startRow = rows.size();
\r
204 final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);
\r
205 indexEntries.add(indexEntry);
\r
207 final Row tokenRow = new Row(-(t + 1));
\r
208 rows.add(tokenRow);
\r
210 for (final TokenEntryData entryData : tokenData.entries) {
\r
211 final Row entryRow = new Row(entryData.entryIndex);
\r
212 rows.add(entryRow);
\r
218 static final class TokenEntryData implements Comparable<TokenEntryData> {
\r
219 final String token;
\r
220 final SimpleEntry entry;
\r
221 final int entryIndex;
\r
223 private static final int bigNoOverflow = 100000;
\r
225 int minSubEntryIndexOf = bigNoOverflow;
\r
226 int minSubEntryLength = bigNoOverflow;
\r
227 int minSubEntry = bigNoOverflow;
\r
229 public TokenEntryData(final byte lang, final String token, final SimpleEntry entry, final int entryIndex) {
\r
230 this.token = token;
\r
231 this.entry = entry;
\r
232 this.entryIndex = entryIndex;
\r
234 final String[] subentries = entry.getAllText(lang);
\r
235 for (int s = 0; s < subentries.length; ++s) {
\r
236 final String subentry = subentries[s];
\r
237 int indexOf = subentry.indexOf(token);
\r
238 if (indexOf != -1) {
\r
239 minSubEntryIndexOf = Math.min(minSubEntryIndexOf, indexOf);
\r
240 minSubEntryLength = Math.min(minSubEntryLength, subentry.length());
\r
241 minSubEntry = Math.min(minSubEntry, s);
\r
247 public int compareTo(final TokenEntryData that) {
\r
248 assert this.token.equals(that.token);
\r
250 if (this.minSubEntryIndexOf != that.minSubEntryIndexOf) {
\r
251 return this.minSubEntryIndexOf - that.minSubEntryIndexOf;
\r
253 if (this.minSubEntryLength != that.minSubEntryLength) {
\r
254 return this.minSubEntryLength - that.minSubEntryLength;
\r
256 return this.minSubEntry - that.minSubEntry;
\r
260 static final class TokenData {
\r
261 final String token;
\r
262 final List<TokenEntryData> entries = new ArrayList<TokenEntryData>();
\r
266 public TokenData(final String token) {
\r
267 this.token = token;
\r