1 package com.hughes.android.dictionary;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.FileInputStream;
\r
5 import java.io.FileNotFoundException;
\r
6 import java.io.IOException;
\r
7 import java.io.InputStreamReader;
\r
8 import java.io.RandomAccessFile;
\r
9 import java.nio.charset.Charset;
\r
10 import java.util.ArrayList;
\r
11 import java.util.Collections;
\r
12 import java.util.Comparator;
\r
13 import java.util.HashMap;
\r
14 import java.util.List;
\r
15 import java.util.Map;
\r
16 import java.util.Set;
\r
18 import com.hughes.android.dictionary.Dictionary.IndexEntry;
\r
19 import com.hughes.android.dictionary.Dictionary.Row;
\r
21 public class DictionaryBuilder {
\r
23 public static void main(String[] args) throws IOException,
\r
24 ClassNotFoundException {
\r
25 if (args.length != 1) {
\r
26 System.err.println("outputfile");
\r
29 final String dictOutFilename = args[0];
\r
31 final Dictionary dict = new Dictionary(Language.DE, Language.EN);
\r
32 System.out.println(Charset.forName("Cp1252"));
\r
33 processInputFile("c:\\de-en-chemnitz.txt", dict, true, Charset.forName("UTF8"));
\r
34 processInputFile("c:\\de-en-dictcc.txt", dict, false, Charset.forName("Cp1252"));
\r
36 createIndex(dict, Entry.LANG1);
\r
37 createIndex(dict, Entry.LANG2);
\r
39 System.out.println("Writing dictionary.");
\r
40 final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
\r
41 dictOut.setLength(0);
\r
42 dict.write(dictOut);
\r
46 private static void processInputFile(final String filename,
\r
47 final Dictionary dict, final boolean hasMultipleSubentries, final Charset charset) throws FileNotFoundException, IOException {
\r
48 final BufferedReader dictionaryIn = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));
\r
51 while ((line = dictionaryIn.readLine()) != null) {
\r
52 // System.out.println(line);
\r
54 if (line.isEmpty() || line.startsWith("#")) {
\r
58 final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries);
\r
59 if (entry == null) {
\r
60 System.err.println("Invalid entry: " + line);
\r
64 dict.entries.add(entry);
\r
66 if (lineCount % 10000 == 0) {
\r
67 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
71 dictionaryIn.close();
\r
74 public static void createIndex(final Dictionary dict, final byte lang) {
\r
75 System.out.println("Creating index: " + lang);
\r
77 final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();
\r
78 final EntryData entryDatas[] = new EntryData[dict.entries.size()];
\r
80 for (int e = 0; e < dict.entries.size(); ++e) {
\r
81 final Entry entry = dict.entries.get(e);
\r
82 final Set<String> tokens = entry.getIndexableTokens(lang);
\r
83 entryDatas[e] = new EntryData(tokens.size());
\r
84 for (final String token : tokens) {
\r
85 TokenData tokenData = tokenDatas.get(token);
\r
86 if (tokenData == null) {
\r
87 tokenData = new TokenData(token);
\r
88 tokenDatas.put(token, tokenData);
\r
90 tokenData.entries.add(e);
\r
93 if (e % 10000 == 0) {
\r
94 System.out.println("createIndex: " + "e=" + e);
\r
100 System.out.println("Sorting TokenData...");
\r
101 final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas
\r
103 Collections.sort(sortedIndex, new Comparator<TokenData>() {
\r
105 public int compare(TokenData tokenData0, TokenData tokenData1) {
\r
106 return dict.languageDatas[lang].language.tokenComparator.compare(tokenData0.token, tokenData1.token);
\r
109 System.out.println("Sorting entries within each TokenData...");
\r
110 final Comparator<Integer> entryComparator = new Comparator<Integer>() {
\r
112 public int compare(Integer o1, Integer o2) {
\r
113 // TODO: better this
\r
114 // Relevant (first token match) chemnitz entries first
\r
115 // first token position in entry
\r
116 // entry length in chars
\r
117 return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1
\r
118 : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;
\r
121 for (final TokenData tokenData : tokenDatas.values()) {
\r
122 Collections.sort(tokenData.entries, entryComparator);
\r
125 // Put it all together.
\r
126 System.out.println("Assembling final data structures...");
\r
127 final List<Row> rows = dict.languageDatas[lang].rows;
\r
128 final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;
\r
129 for (int t = 0; t < sortedIndex.size(); ++t) {
\r
130 final TokenData tokenData = sortedIndex.get(t);
\r
131 final int startRow = rows.size();
\r
132 final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);
\r
133 indexEntries.add(indexEntry);
\r
135 final Row tokenRow = new Row(-(t + 1));
\r
136 rows.add(tokenRow);
\r
138 for (final Integer e : tokenData.entries) {
\r
139 final Row entryRow = new Row(e);
\r
140 rows.add(entryRow);
\r
146 static final class EntryData {
\r
147 final int numTokens;
\r
149 public EntryData(int numTokens) {
\r
150 this.numTokens = numTokens;
\r
154 static final class TokenData {
\r
155 final String token;
\r
156 final List<Integer> entries = new ArrayList<Integer>();
\r
160 public TokenData(final String token) {
\r
161 this.token = token;
\r