package com.hughes.android.dictionary;\r
\r
+import java.io.BufferedReader;\r
+import java.io.FileInputStream;\r
+import java.io.FileNotFoundException;\r
import java.io.IOException;\r
+import java.io.InputStreamReader;\r
import java.io.RandomAccessFile;\r
+import java.nio.charset.Charset;\r
import java.util.ArrayList;\r
-import java.util.Arrays;\r
import java.util.Collections;\r
import java.util.Comparator;\r
import java.util.HashMap;\r
-import java.util.LinkedHashSet;\r
import java.util.List;\r
import java.util.Map;\r
import java.util.Set;\r
-import java.util.regex.Pattern;\r
\r
import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
import com.hughes.android.dictionary.Dictionary.Row;\r
\r
public class DictionaryBuilder {\r
\r
- static final Pattern WHITESPACE = Pattern.compile("\\s+");\r
-\r
// Command-line entry point. This listing is in patch form: '+' lines are the\r
// current version, '-' lines the version they replace.\r
//\r
// Current behaviour: expects exactly one argument, the output file name. It\r
// builds a Dictionary for Language.DE / Language.EN from two input files,\r
// creates an index for each language side (Entry.LANG1, Entry.LANG2) and\r
// writes the dictionary to the output file.\r
//\r
// args: args[0] is the output file name; with any other argument count a\r
// usage string is printed to stderr and the method returns without doing work.\r
// Throws IOException / ClassNotFoundException as declared.\r
public static void main(String[] args) throws IOException,\r
ClassNotFoundException {\r
- if (args.length != 2) {\r
- System.err.println("inputfile outputfile");\r
+ if (args.length != 1) {\r
+ System.err.println("outputfile");\r
return;\r
}\r
+ final String dictOutFilename = args[0];\r
+\r
+ final Dictionary dict = new Dictionary(Language.DE, Language.EN);\r
// Prints the resolved Cp1252 charset (also fails fast here if it is unsupported).\r
+ System.out.println(Charset.forName("Cp1252"));\r
// NOTE(review): input paths and their charsets are hard-coded; only the output\r
// path comes from the command line. The third argument is hasMultipleSubentries.\r
+ processInputFile("c:\\de-en-chemnitz.txt", dict, true, Charset.forName("UTF8"));\r
+ processInputFile("c:\\de-en-dictcc.txt", dict, false, Charset.forName("Cp1252"));\r
\r
- final Dictionary dict = new Dictionary("de", "en");\r
- final RandomAccessFile dictionaryFile = new RandomAccessFile(args[0], "r");\r
// Index both language sides once all input files have been processed.\r
+ createIndex(dict, Entry.LANG1);\r
+ createIndex(dict, Entry.LANG2);\r
+\r
+ System.out.println("Writing dictionary.");\r
+ final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");\r
// Truncate any existing output file so stale bytes do not survive the write.\r
+ dictOut.setLength(0);\r
+ dict.write(dictOut);\r
+ dictOut.close();\r
+ }\r
+\r
// Reads one dictionary input file line by line, decoding it with the given\r
// charset, and parses each usable line into an Entry.\r
//\r
// filename: path of the input file to read.\r
// dict: the dictionary being built (its use falls in lines elided from this view).\r
// hasMultipleSubentries: passed through unchanged to Entry.parseFromLine.\r
// charset: character encoding used to decode the file.\r
//\r
// Blank lines and lines starting with '#' are skipped; lines that\r
// Entry.parseFromLine rejects (returns null) are reported on stderr and skipped.\r
//\r
// The '-' lines in this span are the previous inline implementation in main,\r
// which read via RandomAccessFile and tracked the file pointer.\r
//\r
// NOTE(review): in the visible code close() is only reached on the normal\r
// path; an exception while reading would leave the reader open - confirm\r
// against the full file.\r
+ private static void processInputFile(final String filename,\r
+ final Dictionary dict, final boolean hasMultipleSubentries, final Charset charset) throws FileNotFoundException, IOException {\r
+ final BufferedReader dictionaryIn = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));\r
String line;\r
int lineCount = 0;\r
- long fileLocation = 0;\r
- while ((line = dictionaryFile.readLine()) != null) {\r
- assert ((int) fileLocation) == fileLocation;\r
+ while ((line = dictionaryIn.readLine()) != null) {\r
+// System.out.println(line);\r
line = line.trim();\r
// Skip blank lines and '#' comment lines.\r
if (line.isEmpty() || line.startsWith("#")) {\r
continue;\r
}\r
\r
- final Entry entry = Entry.parseFromLine(line);\r
+ final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries);\r
// A null result marks an unparseable line: report it and move on.\r
if (entry == null) {\r
System.err.println("Invalid entry: " + line);\r
continue;\r
// (Part of the loop body is elided from this view between the line above and\r
// the progress message below.)\r
System.out.println("IndexBuilder: " + "lineCount=" + lineCount);\r
}\r
lineCount++;\r
- fileLocation = dictionaryFile.getFilePointer();\r
}\r
- dictionaryFile.close();\r
-\r
- createIndex(dict, Entry.LANG1);\r
- createIndex(dict, Entry.LANG2);\r
-\r
- System.out.println("Writing dictionary.");\r
- final RandomAccessFile dictOut = new RandomAccessFile(args[1], "rw");\r
- dictOut.setLength(0);\r
- dict.write(dictOut);\r
- dictOut.close();\r
+ dictionaryIn.close();\r
}\r
\r
// Builds the token index for one language side of the dictionary.\r
//\r
// dict: dictionary whose entries are indexed; results go into\r
// dict.languageDatas[lang].rows and dict.languageDatas[lang].sortedIndex.\r
// lang: which language side to index (callers pass Entry.LANG1 / Entry.LANG2).\r
//\r
// Visible steps: collect the tokens of every entry, sort the distinct tokens\r
// with the language's tokenComparator, sort the entries listed under each token\r
// by their token count, then assemble the rows and index entries.\r
//\r
// Several stretches of this method are elided from this view, including the\r
// declarations of entryDatas and tokenDatas and the body of the final loop.\r
public static void createIndex(final Dictionary dict, final byte lang) {\r
\r
for (int e = 0; e < dict.entries.size(); ++e) {\r
final Entry entry = dict.entries.get(e);\r
- final String text = entry.getIndexableText(lang);\r
- final Set<String> tokens = new LinkedHashSet<String>(Arrays\r
- .asList(WHITESPACE.split(text.trim())));\r
// Tokenization is delegated to the Entry (the '-' lines split on whitespace here).\r
+ final Set<String> tokens = entry.getIndexableTokens(lang);\r
// Record how many tokens this entry has; used below to order entries.\r
entryDatas[e] = new EntryData(tokens.size());\r
for (final String token : tokens) {\r
TokenData tokenData = tokenDatas.get(token);\r
\r
// Sort it.\r
\r
+ System.out.println("Sorting TokenData...");\r
final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas\r
.values());\r
- Collections.sort(sortedIndex);\r
// Order tokens with the comparator belonging to this language.\r
+ Collections.sort(sortedIndex, new Comparator<TokenData>() {\r
+ @Override\r
+ public int compare(TokenData tokenData0, TokenData tokenData1) {\r
+ return dict.languageDatas[lang].language.tokenComparator.compare(tokenData0.token, tokenData1.token);\r
+ }});\r
\r
+ System.out.println("Sorting entries within each TokenData...");\r
// Orders entry indices by ascending token count of the entry they refer to.\r
final Comparator<Integer> entryComparator = new Comparator<Integer>() {\r
@Override\r
public int compare(Integer o1, Integer o2) {\r
+ // TODO: better this\r
+ // Relevant (first token match) chemnitz entries first\r
+ // first token position in entry\r
+ // entry length in chars\r
return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1\r
: entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;\r
}\r
};\r
-\r
for (final TokenData tokenData : tokenDatas.values()) {\r
Collections.sort(tokenData.entries, entryComparator);\r
}\r
\r
// Put it all together.\r
-\r
- final List<Row> rows = dict.languages[lang].rows;\r
- final List<IndexEntry> indexEntries = dict.languages[lang].sortedIndex;\r
-\r
+ System.out.println("Assembling final data structures...");\r
+ final List<Row> rows = dict.languageDatas[lang].rows;\r
+ final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;\r
// Walk the tokens in sorted order; startRow is the current size of rows when\r
// a token is reached (the rest of the loop body is elided from this view).\r
for (int t = 0; t < sortedIndex.size(); ++t) {\r
final TokenData tokenData = sortedIndex.get(t);\r
final int startRow = rows.size();\r
}\r
}\r
\r
// Per-token bookkeeping used while building an index: the token text, the\r
// entries that contain it, and a start row.\r
//\r
// The '-' lines show that this class used to implement Comparable; ordering is\r
// now supplied externally by the Comparator passed to Collections.sort in\r
// createIndex.\r
- static final class TokenData implements Comparable<TokenData> {\r
+ static final class TokenData {\r
// The token text this record describes.\r
final String token;\r
// Integer values that createIndex's entryComparator uses as indices into entryDatas.\r
final List<Integer> entries = new ArrayList<Integer>();\r
\r
// Not assigned anywhere in the visible code; presumably the first row for\r
// this token in the language's row list - TODO confirm.\r
int startRow;\r
\r
- public TokenData(String token) {\r
+ public TokenData(final String token) {\r
this.token = token;\r
}\r
-\r
- @Override\r
- public int compareTo(TokenData that) {\r
- return EntryFactory.entryFactory.getEntryComparator().compare(this.token,\r
- that.token);\r
- }\r
}\r
\r
}\r