-<?xml version="1.0" encoding="UTF-8"?>\r
-<classpath>\r
- <classpathentry kind="src" path="src"/>\r
- <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>\r
- <classpathentry combineaccessrules="false" kind="src" path="/Dictionary"/>\r
- <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>\r
- <classpathentry kind="output" path="bin"/>\r
-</classpath>\r
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src"/>
+ <classpathentry combineaccessrules="false" kind="src" path="/Dictionary"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.launching.macosx.MacOSXType/JVM 1.6.0"/>
+ <classpathentry kind="output" path="bin"/>
+</classpath>
System.out.println("lang1=" + lang1);\r
System.out.println("lang2=" + lang2);\r
System.out.println("summaryText=" + summaryText);\r
- System.out.println("dictOut=" + dictOutFilename); \r
+ System.out.println("dictOut=" + dictOutFilename);\r
\r
final Dictionary dict = new Dictionary(summaryText, lang1, lang2);\r
\r
final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);\r
\r
for (int e = 0; e < dict.entries.size(); ++e) {\r
- final SimpleEntry entry = dict.entries.get(e);\r
+ final SimpleEntry entry = null; //dict.entries.get(e);\r
final Set<String> tokens = entry.getIndexableTokens(lang);\r
for (final String token : tokens) {\r
TokenData tokenData = tokenToData.get(token);\r
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.hughes.android.dictionary.Language;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
+
+public class DictFileParser {
+
+ static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
+
+ // Dictcc
+ static final Pattern TAB = Pattern.compile("\\t");
+
+ // Chemnitz
+ static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
+ static final Pattern PIPE = Pattern.compile(" \\| ");
+
+ static final Pattern SPACES = Pattern.compile("\\s+");
+ static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
+ static final Pattern EN_VERB = Pattern.compile("^to ([^ ]+)");
+
+ static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
+ static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\]");
+
+ static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");
+ static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
+
+ static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$");
+
+ final Charset charset;
+ final boolean flipCols;
+
+ final Pattern fieldSplit;
+ final Pattern subfieldSplit;
+
+ final DictionaryBuilder dictBuilder;
+ final IndexBuilder[] langIndexBuilders;
+ final IndexBuilder bothIndexBuilder;
+
+ final Set<String> alreadyDone = new HashSet<String>();
+
+ public DictFileParser(final Charset charset, boolean flipCols,
+ final Pattern fieldSplit, final Pattern subfieldSplit,
+ final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders,
+ final IndexBuilder bothIndexBuilder) {
+ this.charset = charset;
+ this.flipCols = flipCols;
+ this.fieldSplit = fieldSplit;
+ this.subfieldSplit = subfieldSplit;
+ this.dictBuilder = dictBuilder;
+ this.langIndexBuilders = langIndexBuilders;
+ this.bothIndexBuilder = bothIndexBuilder;
+ }
+
+ public void parseFile(final File file) throws IOException {
+ final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
+ String line;
+ while ((line = reader.readLine()) != null) {
+ parseLine(line);
+ }
+ }
+
+ private void parseLine(final String line) {
+ if (line.startsWith("#") || line.length() == 0) {
+ logger.info("Skipping comment line: " + line);
+ return;
+ }
+ final String[] fields = fieldSplit.split(line);
+ if (fields.length != 2) {
+ logger.warning("Malformed line: " + line);
+ return;
+ }
+
+ fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
+ fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
+ if (flipCols) {
+ final String temp = fields[0];
+ fields[0] = fields[1];
+ fields[1] = temp;
+ }
+
+ final String[][] subfields = new String[2][];
+ if (subfieldSplit != null) {
+ subfields[0] = subfieldSplit.split(fields[0]);
+ subfields[1] = subfieldSplit.split(fields[1]);
+ if (subfields[0].length != subfields[1].length) {
+ logger.warning("Number of subfields doesn't match: " + line);
+ return;
+ }
+ } else {
+ subfields[0] = new String[] { fields[0] };
+ subfields[1] = new String[] { fields[1] };
+ }
+
+ final Pair[] pairs = new Pair[subfields[0].length];
+ for (int i = 0; i < pairs.length; ++i) {
+ pairs[i] = new Pair(subfields[0][i], subfields[1][i]);
+ }
+ final PairEntry pairEntry = new PairEntry(pairs);
+ final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+ dictBuilder.dictionary.pairEntries.add(pairEntry);
+ dictBuilder.entryDatas.add(entryData); // TODO: delete me.
+
+ for (int l = 0; l < 2; ++l) {
+ alreadyDone.clear();
+
+ for (int j = 0; j < subfields[l].length; ++j) {
+ String subfield = subfields[l][j];
+ final IndexBuilder indexBuilder = langIndexBuilders[l];
+ if (indexBuilder.index.sortLanguage == Language.de) {
+ subfield = parseField_DE(indexBuilder, subfield, entryData, j);
+ } else if (indexBuilder.index.sortLanguage == Language.en) {
+ subfield = parseField_EN(indexBuilder, subfield, entryData, j);
+ }
+ parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields.length);
+ }
+ }
+ }
+
+ private void parseFieldGeneric(final IndexBuilder indexBuilder, String field,
+ final EntryData entryData, final int subfieldIdx, final int numSubFields) {
+ // remove bracketed and parenthesized stuff.
+ final StringBuilder bracketed = new StringBuilder();
+ final StringBuilder parenthesized = new StringBuilder();
+
+ Matcher matcher;
+ while ((matcher = BRACKETED.matcher(field)).matches()) {
+ bracketed.append(matcher.group(1)).append(" ");
+ field = matcher.replaceFirst(" ");
+ }
+
+ while ((matcher = PARENTHESIZED.matcher(field)).matches()) {
+ parenthesized.append(matcher.group(1)).append(" ");
+ field = matcher.replaceFirst(" ");
+ }
+
+ field = SPACES.matcher(field).replaceAll(" ").trim();
+
+ // split words on non -A-z0-9, do them.
+ final String[] tokens = NON_CHAR_DASH.split(field);
+
+ final EntryTypeName entryTypeName;
+ if (numSubFields == 1) {
+ assert subfieldIdx == 0;
+ if (tokens.length == 1) {
+ entryTypeName = EntryTypeName.ONE_WORD;
+ } else if (tokens.length == 2) {
+ entryTypeName = EntryTypeName.TWO_WORDS;
+ } else if (tokens.length == 3) {
+ entryTypeName = EntryTypeName.THREE_WORDS;
+ } else if (tokens.length == 4) {
+ entryTypeName = EntryTypeName.FOUR_WORDS;
+ } else {
+ entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS;
+ }
+ } else {
+ assert numSubFields > 1;
+ if (subfieldIdx == 0) {
+ if (tokens.length == 1) {
+ entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD;
+ } else {
+ entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS;
+ }
+ } else {
+ assert subfieldIdx > 0;
+ if (tokens.length == 1) {
+ entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD;
+ } else {
+ entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS;
+ }
+ }
+ }
+
+ for (String token : tokens) {
+ token = TRIM_PUNC.matcher(token).replaceAll("");
+ if (!alreadyDone.contains(token) && token.length() > 0) {
+ final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, entryTypeName);
+ entries.add(entryData);
+ alreadyDone.add(token);
+
+ // also split words on dashes, do them, too.
+ if (token.contains("-")) {
+ final String[] dashed = token.split("-");
+ for (final String dashedToken : dashed) {
+ if (!alreadyDone.contains(dashedToken) && dashedToken.length() > 0) {
+ final List<EntryData> dashEntries = indexBuilder.getOrCreateEntries(dashedToken, EntryTypeName.PART_OF_HYPHENATED);
+ dashEntries.add(entryData);
+ }
+ }
+ }
+
+ } // if (!alreadyDone.contains(token)) {
+ } // for (final String token : tokens) {
+
+ // process bracketed stuff (split on spaces and dashes always)
+ final String[] bracketedTokens = NON_CHAR.split(bracketed.toString());
+ for (final String token : bracketedTokens) {
+ assert !token.contains("-");
+ if (!alreadyDone.contains(token) && token.length() > 0) {
+ final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.BRACKETED);
+ entries.add(entryData);
+ }
+ }
+
+ // process paren stuff
+ final String[] parenTokens = NON_CHAR.split(bracketed.toString());
+ for (final String token : parenTokens) {
+ assert !token.contains("-");
+ if (!alreadyDone.contains(token) && token.length() > 0) {
+ final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.PARENTHESIZED);
+ entries.add(entryData);
+ }
+ }
+
+ }
+
+ private String parseField_DE(final IndexBuilder indexBuilder, String field,
+ final EntryData entryData, final int subfieldIdx) {
+ final Matcher matcher = DE_NOUN.matcher(field);
+ while (matcher.find()) {
+ final String noun = matcher.group(1);
+ //final String gender = matcher.group(2);
+ if (alreadyDone.add(noun)) {
+ // System.out.println("Found DE noun " + noun + ", " + gender);
+ final List<EntryData> entries = indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN);
+ entries.add(entryData);
+ }
+ }
+ return field;
+ }
+
+ private String parseField_EN(final IndexBuilder indexBuilder, String field,
+ final EntryData entryData, final int subfieldIdx) {
+ if (field.startsWith("to ")) {
+ field = field.substring(3);
+ }
+ return field;
+ }
+
+
+}
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import com.hughes.android.dictionary.Language;
+import com.hughes.util.Args;
+import com.hughes.util.FileUtil;
+
+/*
+
+--maxEntries=100
+--dictOut=de-en.dict
+--lang1=DE
+--lang2=EN
+--dictInfo=@dictInfo.txt
+
+--input0=/Users/thadh/personal/quickDic/de-en-chemnitz.txt
+--input0Name=chemnitz
+--input0Charset=UTF8
+--input0Format=chemnitz
+
+--input1=/Users/thadh/personal/quickDic/dewiktionary-20100326-pages-articles.xml
+--input1Name=wiktionary
+--input1Format=wiktionary
+
+--input2=/Users/thadh/personal/quickDic/de-en-dictcc.txt
+--input2Name=dictcc
+--input2Charset=Cp1252
+--input2Format=dictcc
+ */
+
+public class DictionaryBuilder {
+
+ final Dictionary dictionary;
+
+ final List<EntryData> entryDatas = new ArrayList<EntryData>();
+
+ final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
+
+ public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) {
+ dictionary = new Dictionary(dictInfo);
+ indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0));
+ indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1));
+ }
+
+ void build() {
+ for (final IndexBuilder indexBuilder : indexBuilders) {
+ indexBuilder.build();
+ dictionary.indices.add(indexBuilder.index);
+ }
+ }
+
+ public static void main(final String[] args) throws IOException {
+ final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
+
+ final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
+ final Language lang2 = Language.lookup(keyValueArgs.remove("lang2"));
+ if (lang1 == null || lang2 == null) {
+ fatalError("--lang1= and --lang2= must both be specified.");
+ }
+
+ final String dictOutFilename = keyValueArgs.remove("dictOut");
+ if (dictOutFilename == null) {
+ fatalError("--dictOut= must be specified.");
+ }
+
+ String dictInfo = keyValueArgs.remove("dictInfo");
+ if (dictInfo == null) {
+ fatalError("--dictInfo= must be specified.");
+ }
+ if (dictInfo.startsWith("@")) {
+ dictInfo = FileUtil.readToString(new File(dictInfo.substring(1)));
+ }
+
+ final String printFile = keyValueArgs.remove("print");
+
+ System.out.println("lang1=" + lang1);
+ System.out.println("lang2=" + lang2);
+ System.out.println("dictInfo=" + dictInfo);
+ System.out.println("dictOut=" + dictOutFilename);
+
+ final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2);
+
+ for (int i = 0; i < 100; ++i) {
+ final String prefix = "input" + i;
+ if (keyValueArgs.containsKey(prefix)) {
+ final File file = new File(keyValueArgs.remove(prefix));
+ System.out.println("Processing: " + file);
+ String charsetName = keyValueArgs.remove(prefix + "Charset");
+ if (charsetName == null) {
+ charsetName = "UTF8";
+ }
+ final Charset charset = Charset.forName(charsetName);
+ String inputName = keyValueArgs.remove(prefix + "Name");
+ if (inputName == null) {
+ fatalError("Must specify human readable name for: " + prefix + "Name");
+ }
+
+ String inputFormat = keyValueArgs.remove(prefix + "Format");
+ if ("dictcc".equals(inputFormat)) {
+ new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
+ } else if ("chemnitz".equals(inputFormat)) {
+ new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
+ } else if ("wiktionary".equals(inputFormat)) {
+ throw new RuntimeException();
+// new WiktionaryXmlParser(dict).parse(file);
+ } else {
+ fatalError("Invalid or missing input format: " + inputFormat);
+ }
+
+ final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName);
+ dictionaryBuilder.dictionary.sources.add(entrySource);
+ System.out.println("Done: " + file + "\n\n");
+ }
+ }
+
+ dictionaryBuilder.build();
+
+ if (printFile != null) {
+ final PrintStream out = new PrintStream(new File(printFile));
+ dictionaryBuilder.dictionary.print(out);
+ out.close();
+ }
+
+ System.out.println("Writing dictionary to: " + dictOutFilename);
+ final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
+ dictOut.setLength(0);
+ dictionaryBuilder.dictionary.write(dictOut);
+ dictOut.close();
+
+ if (!keyValueArgs.isEmpty()) {
+ System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);
+ System.exit(1);
+ }
+
+ }
+
+ private static void fatalError(String string) {
+ System.err.println(string);
+ System.exit(1);
+ }
+
+}
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+
+import com.hughes.util.FileUtil;
+
+import junit.framework.TestCase;
+
+/**
+ * Builds a small DE/EN dictionary from the test data and compares its
+ * printed form against a golden file, both straight after building and after
+ * re-reading the written dictionary file.
+ */
+public class DictionaryBuilderTest extends TestCase {
+
+  public void testGermanCombined() throws IOException {
+    final File result = File.createTempFile("de_en", ".dict");
+    System.out.println("Writing to: " + result);
+    DictionaryBuilder.main(new String[] {
+        "--dictOut=" + result.getAbsolutePath(),
+        "--lang1=DE",
+        "--lang2=EN",
+        "--dictInfo=@testdata/de_en_dictInfo.txt",
+
+        // NOTE(review): this chemnitz input is named "dictcc"; it looks like
+        // a copy-paste slip, but the golden file may depend on it -- confirm
+        // before changing.
+        "--input1=testdata/de-en-chemnitz_100",
+        "--input1Name=dictcc",
+        "--input1Charset=UTF8",
+        "--input1Format=chemnitz",
+
+        "--input2=testdata/de-en-dictcc_100",
+        "--input2Name=dictcc",
+        "--input2Charset=UTF8",
+        "--input2Format=dictcc",
+
+        "--print=testdata/de_en.test",
+    });
+
+    // Check it once:
+    assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+
+
+    // Check it again, this time from the dictionary file that was written.
+    final RandomAccessFile raf = new RandomAccessFile(result.getAbsolutePath(), "r");
+    try {
+      final Dictionary dict = new Dictionary(raf);
+      final PrintStream out = new PrintStream(new File("testdata/de_en.test"));
+      try {
+        dict.print(out);
+      } finally {
+        out.close();
+      }
+    } finally {
+      // Closed only after printing, in case the dictionary reads from the
+      // file while it prints.
+      raf.close();
+    }
+
+    assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+  }
+
+
+  /** Asserts that the two files have identical contents as read by FileUtil.readToString. */
+  void assertFilesEqual(final String expected, final String actual) throws IOException {
+    final String expectedString = FileUtil.readToString(new File(expected));
+    final String actualString = FileUtil.readToString(new File(actual));
+    assertEquals(expectedString, actualString);
+  }
+
+}
--- /dev/null
+/**
+ *
+ */
+package com.hughes.android.dictionary.engine;
+
+import com.hughes.util.IndexedObject;
+
+/**
+ * Couples an {@link Entry} with the integer index handed to
+ * {@link IndexedObject}.
+ */
+class EntryData extends IndexedObject {
+
+  // The entry this record refers to.
+  Entry entry;
+
+  /**
+   * @param index value forwarded to the IndexedObject constructor
+   * @param entry the entry to store
+   */
+  EntryData(final int index, final Entry entry) {
+    super(index);
+    this.entry = entry;
+  }
+
+}
\ No newline at end of file
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import com.hughes.android.dictionary.Language;
+
+/**
+ * Collects, per token, the entries that mention it (grouped by
+ * {@link EntryTypeName}) and turns that collection into the rows of an
+ * {@link Index}.
+ */
+public class IndexBuilder {
+
+  final DictionaryBuilder dictionaryBuilder;
+  final Index index;
+
+  // Tokens ordered by the comparator obtained from language.getSortCollator().
+  final SortedMap<String, TokenData> tokenToData;
+
+  @SuppressWarnings("unchecked")
+  IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language) {
+    this.dictionaryBuilder = dictionaryBuilder;
+    this.index = new Index(dictionaryBuilder.dictionary, shortName, longName, language);
+    this.tokenToData = new TreeMap<String, TokenData>(language.getSortCollator());
+  }
+
+  /**
+   * Walks the tokens in sorted order. For each token it appends an index
+   * entry and a token row, followed by one pair-entry row for every distinct
+   * entry recorded for that token, and prints the per-token entry count.
+   */
+  public void build() {
+    final List<RowBase> allRows = index.rows;
+    final Set<EntryData> seenForToken = new HashSet<EntryData>();
+    for (final TokenData data : tokenToData.values()) {
+      seenForToken.clear();
+
+      final int sortedPosition = index.sortedIndexEntries.size();
+      final int tokenRowPosition = allRows.size();
+      index.sortedIndexEntries.add(new Index.IndexEntry(data.token, tokenRowPosition));
+      allRows.add(new TokenRow(sortedPosition, tokenRowPosition, index));
+
+      for (final List<EntryData> perType : data.typeToEntries.values()) {
+        for (final EntryData candidate : perType) {
+          // An entry may be listed under several types; emit it only once.
+          if (!seenForToken.add(candidate)) {
+            continue;
+          }
+          allRows.add(new PairEntry.Row(candidate.index(), allRows.size(), index));
+        }
+      }
+      // Every emitted row corresponds to exactly one element of the set.
+      System.out.println(seenForToken.size() + " ENTRIES FOR TOKEN " + data.token);
+    }
+  }
+
+  /** A token together with the entries that contain it, grouped by entry type. */
+  static class TokenData {
+    final String token;
+
+    final Map<EntryTypeName, List<EntryData>> typeToEntries = new EnumMap<EntryTypeName, List<EntryData>>(EntryTypeName.class);
+
+    TokenData(final String token) {
+      assert token.equals(token.trim());
+      assert token.length() > 0;
+      this.token = token;
+    }
+  }
+
+  /** Returns the TokenData registered for the token, creating and registering it if absent. */
+  public TokenData getOrCreateTokenData(final String token) {
+    final TokenData existing = tokenToData.get(token);
+    if (existing != null) {
+      return existing;
+    }
+    final TokenData created = new TokenData(token);
+    tokenToData.put(token, created);
+    return created;
+  }
+
+  /** Returns the mutable entry list for the token and entry type, creating it if absent. */
+  public List<EntryData> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
+    final Map<EntryTypeName, List<EntryData>> byType = getOrCreateTokenData(token).typeToEntries;
+    final List<EntryData> existing = byType.get(entryTypeName);
+    if (existing != null) {
+      return existing;
+    }
+    final List<EntryData> created = new ArrayList<EntryData>();
+    byType.put(entryTypeName, created);
+    return created;
+  }
+
+
+}