package com.hughes.android.dictionary;\r
\r
import java.io.BufferedReader;\r
+import java.io.File;\r
import java.io.FileInputStream;\r
import java.io.FileNotFoundException;\r
import java.io.IOException;\r
import java.nio.charset.Charset;\r
import java.util.ArrayList;\r
import java.util.Collections;\r
-import java.util.Comparator;\r
-import java.util.HashMap;\r
import java.util.List;\r
import java.util.Map;\r
+import java.util.Random;\r
import java.util.Set;\r
+import java.util.TreeMap;\r
+\r
+import javax.xml.parsers.ParserConfigurationException;\r
+\r
+import org.xml.sax.SAXException;\r
\r
import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
+import com.hughes.android.dictionary.Dictionary.LanguageData;\r
import com.hughes.android.dictionary.Dictionary.Row;\r
+import com.hughes.util.Args;\r
+import com.hughes.util.FileUtil;\r
\r
public class DictionaryBuilder {\r
-\r
+ \r
public static void main(String[] args) throws IOException,\r
- ClassNotFoundException {\r
- if (args.length != 1) {\r
- System.err.println("outputfile");\r
- return;\r
+ ClassNotFoundException, ParserConfigurationException, SAXException {\r
+ \r
+ final Map<String,String> keyValueArgs = Args.keyValueArgs(args);\r
+ \r
+ final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));\r
+ final Language lang2 = Language.lookup(keyValueArgs.remove("lang2"));\r
+ if (lang1 == null || lang2 == null) {\r
+ fatalError("--lang1= and --lang2= must both be specified.");\r
+ }\r
+ \r
+ final String dictOutFilename = keyValueArgs.remove("dictOut");\r
+ if (dictOutFilename == null) {\r
+ fatalError("--dictOut= must be specified.");\r
+ }\r
+ \r
+ String summaryText = keyValueArgs.remove("summaryText");\r
+ if (summaryText == null) {\r
+ fatalError("--summaryText= must be specified.");\r
+ }\r
+ if (summaryText.startsWith("@")) {\r
+ summaryText = FileUtil.readToString(new File(summaryText.substring(1)));\r
}\r
- final String dictOutFilename = args[0];\r
-\r
- final Dictionary dict = new Dictionary("de-en.txt - a German-English dictionary\n" +\r
- "Version: 1.6, 2009-04-16\n" +\r
- "Source: http://dict.tu-chemnitz.de/\n" +\r
- "Thanks to Frank Richter.", Language.DE, Language.EN);\r
- System.out.println(Charset.forName("Cp1252"));\r
- processInputFile("c:\\de-en-chemnitz.txt", dict, true, Charset.forName("UTF8"));\r
\r
- // Thad's extra sauce: \r
- processInputFile("c:\\de-en-dictcc.txt", dict, false, Charset.forName("Cp1252"));\r
+ final String maxEntriesString = keyValueArgs.remove("maxEntries");\r
+ final int maxEntries = maxEntriesString == null ? Integer.MAX_VALUE : Integer.parseInt(maxEntriesString);\r
+ \r
+ System.out.println("lang1=" + lang1);\r
+ System.out.println("lang2=" + lang2);\r
+ System.out.println("summaryText=" + summaryText);\r
+ System.out.println("dictOut=" + dictOutFilename);\r
\r
- createIndex(dict, Entry.LANG1);\r
- createIndex(dict, Entry.LANG2);\r
+ final Dictionary dict = new Dictionary(summaryText, lang1, lang2);\r
+\r
+ for (int i = 0; i < 100; ++i) {\r
+ final String prefix = "input" + i;\r
+ if (keyValueArgs.containsKey(prefix)) {\r
+ final File file = new File(keyValueArgs.remove(prefix));\r
+ System.out.println("Processing: " + file);\r
+ String charsetName = keyValueArgs.remove(prefix + "Charset");\r
+ if (charsetName == null) {\r
+ charsetName = "UTF8";\r
+ }\r
+ final Charset charset = Charset.forName(charsetName);\r
+ String inputName = keyValueArgs.remove(prefix + "Name");\r
+ if (inputName == null) {\r
+ fatalError("Must specify human readable name for: " + prefix + "Name");\r
+ }\r
+\r
+ String inputFormat = keyValueArgs.remove(prefix + "Format");\r
+ if ("dictcc".equals(inputFormat)) {\r
+ processLinedInputFile(dict, file, charset, false, maxEntries);\r
+ } else if ("chemnitz".equals(inputFormat)) {\r
+ processLinedInputFile(dict, file, charset, true, maxEntries);\r
+ } else if ("wiktionary".equals(inputFormat)) {\r
+ new WiktionaryXmlParser(dict).parse(file);\r
+ } else {\r
+ fatalError("Invalid or missing input format: " + inputFormat);\r
+ }\r
+ \r
+ dict.sources.add(inputName);\r
+ System.out.println("Done: " + file + "\n\n");\r
+ }\r
+ }\r
+ \r
+ if (!keyValueArgs.isEmpty()) {\r
+ System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);\r
+ }\r
+ \r
+ createIndex(dict, SimpleEntry.LANG1);\r
+ createIndex(dict, SimpleEntry.LANG2);\r
\r
System.out.println("Writing dictionary.");\r
final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");\r
dictOut.setLength(0);\r
dict.write(dictOut);\r
dictOut.close();\r
+ \r
+ final Random random = new Random(0);\r
+ for (byte lang = 0; lang < 2; ++lang) {\r
+ final LanguageData languageData = dict.languageDatas[lang];\r
+ System.out.println("\nRandom words for: " + languageData.language.getSymbol());\r
+ for (int i = 0; i < 20; ++i) {\r
+ final int w = random.nextInt(languageData.sortedIndex.size());\r
+ final IndexEntry entry = languageData.sortedIndex.get(w);\r
+ final List<Row> rows = languageData.rows;\r
+ int r = entry.startRow;\r
+ System.out.println(languageData.rowToString(rows.get(r), false));\r
+ ++r;\r
+ while (r < rows.size() && !rows.get(r).isToken()) {\r
+ System.out.println(" " + languageData.rowToString(rows.get(r), false));\r
+ ++r;\r
+ }\r
+ }\r
+ }\r
}\r
\r
- private static void processInputFile(final String filename,\r
- final Dictionary dict, final boolean hasMultipleSubentries, final Charset charset) throws FileNotFoundException, IOException {\r
- final BufferedReader dictionaryIn = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));\r
+ private static void fatalError(String string) {\r
+ System.err.println(string);\r
+ System.exit(1);\r
+ }\r
+\r
+ private static void processLinedInputFile(final Dictionary dict, final File file,\r
+ final Charset charset, final boolean hasMultipleSubentries,\r
+ final int maxEntries) throws FileNotFoundException, IOException {\r
+ final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));\r
String line;\r
int lineCount = 0;\r
- while ((line = dictionaryIn.readLine()) != null) {\r
-// System.out.println(line);\r
+ while ((line = reader.readLine()) != null && lineCount < maxEntries) {\r
+ if (maxEntries < 200) { \r
+ System.out.println(line);\r
+ }\r
line = line.trim();\r
- if (line.isEmpty() || line.startsWith("#")) {\r
+ if (line.equals("") || line.startsWith("#")) {\r
continue;\r
}\r
\r
- final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries);\r
+ final SimpleEntry entry = SimpleEntry.parseFromLine(line, hasMultipleSubentries);\r
if (entry == null) {\r
System.err.println("Invalid entry: " + line);\r
continue;\r
}\r
lineCount++;\r
}\r
- dictionaryIn.close();\r
+ reader.close();\r
}\r
\r
public static void createIndex(final Dictionary dict, final byte lang) {\r
System.out.println("Creating index: " + lang);\r
\r
- final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();\r
- final EntryData entryDatas[] = new EntryData[dict.entries.size()];\r
+ final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);\r
\r
for (int e = 0; e < dict.entries.size(); ++e) {\r
- final Entry entry = dict.entries.get(e);\r
+ final SimpleEntry entry = null; //dict.entries.get(e);\r
final Set<String> tokens = entry.getIndexableTokens(lang);\r
- entryDatas[e] = new EntryData(tokens.size());\r
for (final String token : tokens) {\r
- TokenData tokenData = tokenDatas.get(token);\r
+ TokenData tokenData = tokenToData.get(token);\r
if (tokenData == null) {\r
tokenData = new TokenData(token);\r
- tokenDatas.put(token, tokenData);\r
+ tokenToData.put(token, tokenData);\r
}\r
- tokenData.entries.add(e);\r
+ tokenData.entries.add(new TokenEntryData(lang, token, entry, e));\r
}\r
\r
if (e % 10000 == 0) {\r
// Sort it.\r
\r
System.out.println("Sorting TokenData...");\r
- final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas\r
+ final List<TokenData> sortedTokenData = new ArrayList<TokenData>(tokenToData\r
.values());\r
- Collections.sort(sortedIndex, new Comparator<TokenData>() {\r
- @Override\r
- public int compare(TokenData tokenData0, TokenData tokenData1) {\r
- return dict.languageDatas[lang].language.tokenComparator.compare(tokenData0.token, tokenData1.token);\r
- }});\r
\r
System.out.println("Sorting entries within each TokenData...");\r
- final Comparator<Integer> entryComparator = new Comparator<Integer>() {\r
- @Override\r
- public int compare(Integer o1, Integer o2) {\r
- // TODO: better this\r
- // Relevant (first token match) chemnitz entries first\r
- // first token position in entry\r
- // entry length in chars\r
- return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1\r
- : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;\r
- }\r
- };\r
- for (final TokenData tokenData : tokenDatas.values()) {\r
- Collections.sort(tokenData.entries, entryComparator);\r
+ for (final TokenData tokenData : sortedTokenData) {\r
+ Collections.sort(tokenData.entries);\r
}\r
\r
// Put it all together.\r
System.out.println("Assembling final data structures...");\r
final List<Row> rows = dict.languageDatas[lang].rows;\r
final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;\r
- for (int t = 0; t < sortedIndex.size(); ++t) {\r
- final TokenData tokenData = sortedIndex.get(t);\r
+ for (int t = 0; t < sortedTokenData.size(); ++t) {\r
+ final TokenData tokenData = sortedTokenData.get(t);\r
final int startRow = rows.size();\r
final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);\r
indexEntries.add(indexEntry);\r
final Row tokenRow = new Row(-(t + 1));\r
rows.add(tokenRow);\r
\r
- for (final Integer e : tokenData.entries) {\r
- final Row entryRow = new Row(e);\r
+ for (final TokenEntryData entryData : tokenData.entries) {\r
+ final Row entryRow = new Row(entryData.entryIndex);\r
rows.add(entryRow);\r
}\r
}\r
\r
}\r
\r
- static final class EntryData {\r
- final int numTokens;\r
+ static final class TokenEntryData implements Comparable<TokenEntryData> {\r
+ final String token;\r
+ final SimpleEntry entry;\r
+ final int entryIndex;\r
+ \r
+ private static final int bigNoOverflow = 100000;\r
+\r
+ int minSubEntryIndexOf = bigNoOverflow;\r
+ int minSubEntryLength = bigNoOverflow;\r
+ int minSubEntry = bigNoOverflow;\r
\r
- public EntryData(int numTokens) {\r
- this.numTokens = numTokens;\r
+ public TokenEntryData(final byte lang, final String token, final SimpleEntry entry, final int entryIndex) {\r
+ this.token = token;\r
+ this.entry = entry;\r
+ this.entryIndex = entryIndex;\r
+ \r
+ final String[] subentries = entry.getAllText(lang);\r
+ for (int s = 0; s < subentries.length; ++s) {\r
+ final String subentry = subentries[s];\r
+ int indexOf = subentry.indexOf(token);\r
+ if (indexOf != -1) {\r
+ minSubEntryIndexOf = Math.min(minSubEntryIndexOf, indexOf); \r
+ minSubEntryLength = Math.min(minSubEntryLength, subentry.length());\r
+ minSubEntry = Math.min(minSubEntry, s);\r
+ }\r
+ }\r
+ }\r
+\r
+ @Override\r
+ public int compareTo(final TokenEntryData that) {\r
+ assert this.token.equals(that.token);\r
+ \r
+ if (this.minSubEntryIndexOf != that.minSubEntryIndexOf) {\r
+ return this.minSubEntryIndexOf - that.minSubEntryIndexOf;\r
+ }\r
+ if (this.minSubEntryLength != that.minSubEntryLength) {\r
+ return this.minSubEntryLength - that.minSubEntryLength;\r
+ }\r
+ return this.minSubEntry - that.minSubEntry;\r
}\r
}\r
\r
static final class TokenData {\r
final String token;\r
- final List<Integer> entries = new ArrayList<Integer>();\r
+ final List<TokenEntryData> entries = new ArrayList<TokenEntryData>();\r
\r
int startRow;\r
\r