1 package com.hughes.android.dictionary;
\r
3 import java.io.IOException;
\r
4 import java.io.RandomAccessFile;
\r
5 import java.util.ArrayList;
\r
6 import java.util.Arrays;
\r
7 import java.util.Collections;
\r
8 import java.util.Comparator;
\r
9 import java.util.HashMap;
\r
10 import java.util.LinkedHashSet;
\r
11 import java.util.List;
\r
12 import java.util.Map;
\r
13 import java.util.Set;
\r
14 import java.util.regex.Pattern;
\r
16 import com.hughes.android.dictionary.Dictionary.IndexEntry;
\r
17 import com.hughes.android.dictionary.Dictionary.Row;
\r
19 public class DictionaryBuilder {
\r
21 static final Pattern WHITESPACE = Pattern.compile("\\s+");
\r
23 public static void main(String[] args) throws IOException,
\r
24 ClassNotFoundException {
\r
25 if (args.length != 2) {
\r
26 System.err.println("inputfile outputfile");
\r
30 final Dictionary dict = new Dictionary("de", "en");
\r
31 final RandomAccessFile dictionaryFile = new RandomAccessFile(args[0], "r");
\r
34 long fileLocation = 0;
\r
35 while ((line = dictionaryFile.readLine()) != null) {
\r
36 assert ((int) fileLocation) == fileLocation;
\r
38 if (line.isEmpty() || line.startsWith("#")) {
\r
42 final Entry entry = Entry.parseFromLine(line);
\r
43 if (entry == null) {
\r
44 System.err.println("Invalid entry: " + line);
\r
48 dict.entries.add(entry);
\r
50 if (lineCount % 10000 == 0) {
\r
51 System.out.println("IndexBuilder: " + "lineCount=" + lineCount);
\r
54 fileLocation = dictionaryFile.getFilePointer();
\r
56 dictionaryFile.close();
\r
58 createIndex(dict, Entry.LANG1);
\r
59 createIndex(dict, Entry.LANG2);
\r
61 System.out.println("Writing dictionary.");
\r
62 final RandomAccessFile dictOut = new RandomAccessFile(args[1], "rw");
\r
63 dictOut.setLength(0);
\r
64 dict.write(dictOut);
\r
68 public static void createIndex(final Dictionary dict, final byte lang) {
\r
69 System.out.println("Creating index: " + lang);
\r
71 final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();
\r
72 final EntryData entryDatas[] = new EntryData[dict.entries.size()];
\r
74 for (int e = 0; e < dict.entries.size(); ++e) {
\r
75 final Entry entry = dict.entries.get(e);
\r
76 final String text = entry.getIndexableText(lang);
\r
77 final Set<String> tokens = new LinkedHashSet<String>(Arrays
\r
78 .asList(WHITESPACE.split(text.trim())));
\r
79 entryDatas[e] = new EntryData(tokens.size());
\r
80 for (final String token : tokens) {
\r
81 TokenData tokenData = tokenDatas.get(token);
\r
82 if (tokenData == null) {
\r
83 tokenData = new TokenData(token);
\r
84 tokenDatas.put(token, tokenData);
\r
86 tokenData.entries.add(e);
\r
89 if (e % 10000 == 0) {
\r
90 System.out.println("createIndex: " + "e=" + e);
\r
96 final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas
\r
98 Collections.sort(sortedIndex);
\r
100 final Comparator<Integer> entryComparator = new Comparator<Integer>() {
\r
102 public int compare(Integer o1, Integer o2) {
\r
103 return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1
\r
104 : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;
\r
108 for (final TokenData tokenData : tokenDatas.values()) {
\r
109 Collections.sort(tokenData.entries, entryComparator);
\r
112 // Put it all together.
\r
114 final List<Row> rows = dict.languages[lang].rows;
\r
115 final List<IndexEntry> indexEntries = dict.languages[lang].sortedIndex;
\r
117 for (int t = 0; t < sortedIndex.size(); ++t) {
\r
118 final TokenData tokenData = sortedIndex.get(t);
\r
119 final int startRow = rows.size();
\r
120 final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);
\r
121 indexEntries.add(indexEntry);
\r
123 final Row tokenRow = new Row(-(t + 1));
\r
124 rows.add(tokenRow);
\r
126 for (final Integer e : tokenData.entries) {
\r
127 final Row entryRow = new Row(e);
\r
128 rows.add(entryRow);
\r
134 static final class EntryData {
\r
135 final int numTokens;
\r
137 public EntryData(int numTokens) {
\r
138 this.numTokens = numTokens;
\r
142 static final class TokenData implements Comparable<TokenData> {
\r
143 final String token;
\r
144 final List<Integer> entries = new ArrayList<Integer>();
\r
148 public TokenData(String token) {
\r
149 this.token = token;
\r
153 public int compareTo(TokenData that) {
\r
154 return EntryFactory.entryFactory.getEntryComparator().compare(this.token,
\r