1 package com.hughes.android.dictionary;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.FileInputStream;
\r
5 import java.io.FileNotFoundException;
\r
6 import java.io.IOException;
\r
7 import java.io.InputStreamReader;
\r
8 import java.io.RandomAccessFile;
\r
9 import java.nio.charset.Charset;
\r
10 import java.util.ArrayList;
\r
11 import java.util.Collections;
\r
12 import java.util.Comparator;
\r
13 import java.util.HashMap;
\r
14 import java.util.List;
\r
15 import java.util.Map;
\r
16 import java.util.Set;
\r
18 import com.hughes.android.dictionary.Dictionary.IndexEntry;
\r
19 import com.hughes.android.dictionary.Dictionary.Row;
\r
21 public class DictionaryBuilder {
\r
23 public static void main(String[] args) throws IOException,
\r
24 ClassNotFoundException {
\r
25 if (args.length != 1) {
\r
26 System.err.println("outputfile");
\r
29 final String dictOutFilename = args[0];
\r
31 final Dictionary dict = new Dictionary("de-en.txt - a German-English dictionary\n" +
\r
32 "Version: devel, 2009-04-17\n" +
\r
33 "Source: http://dict.tu-chemnitz.de/\n" +
\r
34 "Thanks to Frank Richter.", Language.DE, Language.EN);
\r
35 System.out.println(Charset.forName("Cp1252"));
\r
36 processInputFile("c:\\de-en-chemnitz.txt", dict, true, Charset.forName("UTF8"));
\r
38 // Thad's extra sauce:
\r
39 processInputFile("c:\\de-en-dictcc.txt", dict, false, Charset.forName("Cp1252"));
\r
41 createIndex(dict, Entry.LANG1);
\r
42 createIndex(dict, Entry.LANG2);
\r
44 System.out.println("Writing dictionary.");
\r
45 final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
\r
46 dictOut.setLength(0);
\r
47 dict.write(dictOut);
\r
51 private static void processInputFile(final String filename,
\r
52 final Dictionary dict, final boolean hasMultipleSubentries, final Charset charset) throws FileNotFoundException, IOException {
\r
53 final BufferedReader dictionaryIn = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));
\r
56 while ((line = dictionaryIn.readLine()) != null) {
\r
57 // System.out.println(line);
\r
59 if (line.isEmpty() || line.startsWith("#")) {
\r
63 final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries);
\r
64 if (entry == null) {
\r
65 System.err.println("Invalid entry: " + line);
\r
69 dict.entries.add(entry);
\r
71 if (lineCount % 10000 == 0) {
\r
72 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
76 dictionaryIn.close();
\r
79 public static void createIndex(final Dictionary dict, final byte lang) {
\r
80 System.out.println("Creating index: " + lang);
\r
82 final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();
\r
83 final EntryData entryDatas[] = new EntryData[dict.entries.size()];
\r
85 for (int e = 0; e < dict.entries.size(); ++e) {
\r
86 final Entry entry = dict.entries.get(e);
\r
87 final Set<String> tokens = entry.getIndexableTokens(lang);
\r
88 entryDatas[e] = new EntryData(tokens.size());
\r
89 for (final String token : tokens) {
\r
90 TokenData tokenData = tokenDatas.get(token);
\r
91 if (tokenData == null) {
\r
92 tokenData = new TokenData(token);
\r
93 tokenDatas.put(token, tokenData);
\r
95 tokenData.entries.add(e);
\r
98 if (e % 10000 == 0) {
\r
99 System.out.println("createIndex: " + "e=" + e);
\r
105 System.out.println("Sorting TokenData...");
\r
106 final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas
\r
108 Collections.sort(sortedIndex, new Comparator<TokenData>() {
\r
110 public int compare(TokenData tokenData0, TokenData tokenData1) {
\r
111 return dict.languageDatas[lang].language.sortComparator.compare(tokenData0.token, tokenData1.token);
\r
114 System.out.println("Sorting entries within each TokenData...");
\r
115 final Comparator<Integer> entryComparator = new Comparator<Integer>() {
\r
117 public int compare(Integer o1, Integer o2) {
\r
118 // TODO: better this
\r
119 // Relevant (first token match) chemnitz entries first
\r
120 // first token position in entry
\r
121 // entry length in chars
\r
122 return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1
\r
123 : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;
\r
126 for (final TokenData tokenData : tokenDatas.values()) {
\r
127 Collections.sort(tokenData.entries, entryComparator);
\r
130 // Put it all together.
\r
131 System.out.println("Assembling final data structures...");
\r
132 final List<Row> rows = dict.languageDatas[lang].rows;
\r
133 final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;
\r
134 for (int t = 0; t < sortedIndex.size(); ++t) {
\r
135 final TokenData tokenData = sortedIndex.get(t);
\r
136 final int startRow = rows.size();
\r
137 final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);
\r
138 indexEntries.add(indexEntry);
\r
140 final Row tokenRow = new Row(-(t + 1));
\r
141 rows.add(tokenRow);
\r
143 for (final Integer e : tokenData.entries) {
\r
144 final Row entryRow = new Row(e);
\r
145 rows.add(entryRow);
\r
151 static final class EntryData {
\r
152 final int numTokens;
\r
154 public EntryData(int numTokens) {
\r
155 this.numTokens = numTokens;
\r
159 static final class TokenData {
\r
160 final String token;
\r
161 final List<Integer> entries = new ArrayList<Integer>();
\r
165 public TokenData(final String token) {
\r
166 this.token = token;
\r