1 package com.hughes.android.dictionary;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.FileInputStream;
\r
5 import java.io.FileNotFoundException;
\r
6 import java.io.IOException;
\r
7 import java.io.InputStreamReader;
\r
8 import java.io.RandomAccessFile;
\r
9 import java.nio.charset.Charset;
\r
10 import java.util.ArrayList;
\r
11 import java.util.Arrays;
\r
12 import java.util.Collections;
\r
13 import java.util.Comparator;
\r
14 import java.util.HashMap;
\r
15 import java.util.List;
\r
16 import java.util.Map;
\r
17 import java.util.Set;
\r
19 import com.hughes.android.dictionary.Dictionary.IndexEntry;
\r
20 import com.hughes.android.dictionary.Dictionary.Row;
\r
22 public class DictionaryBuilder {
\r
24 static final List<InputFile> inputFiles = Arrays.asList(
\r
25 new InputFile("c:\\thad\\de-en-chemnitz.txt", Charset.forName("UTF8"), true),
\r
26 // Thad's extra sauce:
\r
27 new InputFile("c:\\thad\\de-en-dictcc.txt", Charset.forName("Cp1252"), false)
\r
29 static final String dictOutFilename = "c:\\thad\\de-en.dict";
\r
31 static class InputFile {
\r
33 final Charset charset;
\r
34 final boolean hasMultipleSubentries;
\r
35 public InputFile(String file, Charset charset, boolean hasMultipleSubentries) {
\r
37 this.charset = charset;
\r
38 this.hasMultipleSubentries = hasMultipleSubentries;
\r
42 public static void main(String[] args) throws IOException,
\r
43 ClassNotFoundException {
\r
45 final Dictionary dict = new Dictionary("de-en.txt - a German-English dictionary\n" +
\r
46 "Version: devel, 2009-04-17\n" +
\r
47 "Source: http://dict.tu-chemnitz.de/\n" +
\r
48 "Thanks to Frank Richter.", Language.DE, Language.EN);
\r
49 System.out.println(Charset.forName("Cp1252"));
\r
50 for (final InputFile inputFile : inputFiles) {
\r
51 processInputFile(dict, inputFile);
\r
54 createIndex(dict, Entry.LANG1);
\r
55 createIndex(dict, Entry.LANG2);
\r
57 System.out.println("Writing dictionary.");
\r
58 final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
\r
59 dictOut.setLength(0);
\r
60 dict.write(dictOut);
\r
64 private static void processInputFile(final Dictionary dict, final InputFile inputFile) throws FileNotFoundException, IOException {
\r
65 final BufferedReader dictionaryIn = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile.file), inputFile.charset));
\r
68 while ((line = dictionaryIn.readLine()) != null) {
\r
69 // System.out.println(line);
\r
71 if (line.isEmpty() || line.startsWith("#")) {
\r
75 final Entry entry = Entry.parseFromLine(line, inputFile.hasMultipleSubentries);
\r
76 if (entry == null) {
\r
77 System.err.println("Invalid entry: " + line);
\r
81 dict.entries.add(entry);
\r
83 if (lineCount % 10000 == 0) {
\r
84 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
88 dictionaryIn.close();
\r
91 public static void createIndex(final Dictionary dict, final byte lang) {
\r
92 System.out.println("Creating index: " + lang);
\r
94 final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();
\r
95 final EntryData entryDatas[] = new EntryData[dict.entries.size()];
\r
97 for (int e = 0; e < dict.entries.size(); ++e) {
\r
98 final Entry entry = dict.entries.get(e);
\r
99 final Set<String> tokens = entry.getIndexableTokens(lang);
\r
100 entryDatas[e] = new EntryData(tokens.size());
\r
101 for (final String token : tokens) {
\r
102 TokenData tokenData = tokenDatas.get(token);
\r
103 if (tokenData == null) {
\r
104 tokenData = new TokenData(token);
\r
105 tokenDatas.put(token, tokenData);
\r
107 tokenData.entries.add(e);
\r
110 if (e % 10000 == 0) {
\r
111 System.out.println("createIndex: " + "e=" + e);
\r
117 System.out.println("Sorting TokenData...");
\r
118 final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas
\r
120 Collections.sort(sortedIndex, new Comparator<TokenData>() {
\r
122 public int compare(TokenData tokenData0, TokenData tokenData1) {
\r
123 return dict.languageDatas[lang].language.sortComparator.compare(tokenData0.token, tokenData1.token);
\r
126 System.out.println("Sorting entries within each TokenData...");
\r
127 final Comparator<Integer> entryComparator = new Comparator<Integer>() {
\r
129 public int compare(Integer o1, Integer o2) {
\r
130 // TODO: better this
\r
131 // Relevant (first token match) chemnitz entries first
\r
132 // first token position in entry
\r
133 // entry length in chars
\r
134 return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1
\r
135 : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;
\r
138 for (final TokenData tokenData : tokenDatas.values()) {
\r
139 Collections.sort(tokenData.entries, entryComparator);
\r
142 // Put it all together.
\r
143 System.out.println("Assembling final data structures...");
\r
144 final List<Row> rows = dict.languageDatas[lang].rows;
\r
145 final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;
\r
146 for (int t = 0; t < sortedIndex.size(); ++t) {
\r
147 final TokenData tokenData = sortedIndex.get(t);
\r
148 final int startRow = rows.size();
\r
149 final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);
\r
150 indexEntries.add(indexEntry);
\r
152 final Row tokenRow = new Row(-(t + 1));
\r
153 rows.add(tokenRow);
\r
155 for (final Integer e : tokenData.entries) {
\r
156 final Row entryRow = new Row(e);
\r
157 rows.add(entryRow);
\r
163 static final class EntryData {
\r
164 final int numTokens;
\r
166 public EntryData(int numTokens) {
\r
167 this.numTokens = numTokens;
\r
171 static final class TokenData {
\r
172 final String token;
\r
173 final List<Integer> entries = new ArrayList<Integer>();
\r
177 public TokenData(final String token) {
\r
178 this.token = token;
\r