From: Thad Hughes
Date: Fri, 8 Oct 2010 22:14:16 +0000 (-0700)
Subject: go
X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=45d5ecd68dcf3feea93007388d89e41703f7e02c

go
---

diff --git a/.classpath b/.classpath
index 22e2c09..cc0189b 100755
--- a/.classpath
+++ b/.classpath
@@ -1,8 +1,8 @@
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
diff --git a/src/com/hughes/android/dictionary/DictionaryBuilder.java b/src/com/hughes/android/dictionary/DictionaryBuilder.java
index 1e76822..ba60c4c 100755
--- a/src/com/hughes/android/dictionary/DictionaryBuilder.java
+++ b/src/com/hughes/android/dictionary/DictionaryBuilder.java
@@ -58,7 +58,7 @@ public class DictionaryBuilder {
     System.out.println("lang1=" + lang1);
     System.out.println("lang2=" + lang2);
     System.out.println("summaryText=" + summaryText);
-    System.out.println("dictOut=" + dictOutFilename);
+    System.out.println("dictOut=" + dictOutFilename);
 
     final Dictionary dict = new Dictionary(summaryText, lang1, lang2);
 
@@ -167,7 +167,7 @@ public class DictionaryBuilder {
       final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);
 
       for (int e = 0; e < dict.entries.size(); ++e) {
-        final SimpleEntry entry = dict.entries.get(e);
+        final SimpleEntry entry = null; //dict.entries.get(e);
         final Set<String> tokens = entry.getIndexableTokens(lang);
         for (final String token : tokens) {
           TokenData tokenData = tokenToData.get(token);
diff --git a/src/com/hughes/android/dictionary/engine/DictFileParser.java b/src/com/hughes/android/dictionary/engine/DictFileParser.java
new file mode 100644
index 0000000..9b4ac0a
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/DictFileParser.java
@@ -0,0 +1,302 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.hughes.android.dictionary.Language;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
+
+/**
+ * Parses line-oriented bilingual dictionary files (one entry per line) and
+ * feeds the resulting entries and index tokens into a DictionaryBuilder.
+ */
+public class DictFileParser {
+
+  static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
+
+  // Dictcc
+  static final Pattern TAB = Pattern.compile("\\t");
+
+  // Chemnitz
+  static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
+  static final Pattern PIPE = Pattern.compile(" \\| ");
+
+  static final Pattern SPACES = Pattern.compile("\\s+");
+  static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
+  static final Pattern EN_VERB = Pattern.compile("^to ([^ ]+)");
+
+  static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
+  static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
+
+  static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");
+  static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
+
+  static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$");
+
+  final Charset charset;
+  final boolean flipCols;
+
+  final Pattern fieldSplit;
+  final Pattern subfieldSplit;
+
+  final DictionaryBuilder dictBuilder;
+  final IndexBuilder[] langIndexBuilders;
+  final IndexBuilder bothIndexBuilder;
+
+  // Tokens already indexed for the language column currently being processed.
+  final Set<String> alreadyDone = new HashSet<String>();
+
+  /**
+   * @param charset encoding used to read the input file
+   * @param flipCols whether to swap the two columns of every line
+   * @param fieldSplit separates the two language columns of a line
+   * @param subfieldSplit separates the rows within a column; null if a line holds a single row
+   * @param dictBuilder receives the parsed entries
+   * @param langIndexBuilders index builders for column 0 and column 1, in that order
+   * @param bothIndexBuilder stored, but not used by the code in this class
+   */
+  public DictFileParser(final Charset charset, boolean flipCols,
+      final Pattern fieldSplit, final Pattern subfieldSplit,
+      final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders,
+      final IndexBuilder bothIndexBuilder) {
+    this.charset = charset;
+    this.flipCols = flipCols;
+    this.fieldSplit = fieldSplit;
+    this.subfieldSplit = subfieldSplit;
+    this.dictBuilder = dictBuilder;
+    this.langIndexBuilders = langIndexBuilders;
+    this.bothIndexBuilder = bothIndexBuilder;
+  }
+
+  /**
+   * Reads the file line by line and parses every line.
+   *
+   * @throws IOException if the file cannot be opened or read
+   */
+  public void parseFile(final File file) throws IOException {
+    final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        parseLine(line);
+      }
+    } finally {
+      // Release the file handle even when reading or parsing fails.
+      reader.close();
+    }
+  }
+
+  /**
+   * Turns one input line into a PairEntry and indexes each of its subfields
+   * with the index builder of the matching language column. Comment lines,
+   * empty lines and malformed lines are logged and skipped.
+   */
+  private void parseLine(final String line) {
+    if (line.startsWith("#") || line.length() == 0) {
+      logger.info("Skipping comment line: " + line);
+      return;
+    }
+    final String[] fields = fieldSplit.split(line);
+    if (fields.length != 2) {
+      logger.warning("Malformed line: " + line);
+      return;
+    }
+
+    fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
+    fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
+    if (flipCols) {
+      final String temp = fields[0];
+      fields[0] = fields[1];
+      fields[1] = temp;
+    }
+
+    final String[][] subfields = new String[2][];
+    if (subfieldSplit != null) {
+      subfields[0] = subfieldSplit.split(fields[0]);
+      subfields[1] = subfieldSplit.split(fields[1]);
+      if (subfields[0].length != subfields[1].length) {
+        logger.warning("Number of subfields doesn't match: " + line);
+        return;
+      }
+    } else {
+      subfields[0] = new String[] { fields[0] };
+      subfields[1] = new String[] { fields[1] };
+    }
+
+    final Pair[] pairs = new Pair[subfields[0].length];
+    for (int i = 0; i < pairs.length; ++i) {
+      pairs[i] = new Pair(subfields[0][i], subfields[1][i]);
+    }
+    final PairEntry pairEntry = new PairEntry(pairs);
+    final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+    dictBuilder.dictionary.pairEntries.add(pairEntry);
+    dictBuilder.entryDatas.add(entryData);  // TODO: delete me.
+
+    for (int l = 0; l < 2; ++l) {
+      alreadyDone.clear();
+
+      for (int j = 0; j < subfields[l].length; ++j) {
+        String subfield = subfields[l][j];
+        final IndexBuilder indexBuilder = langIndexBuilders[l];
+        if (indexBuilder.index.sortLanguage == Language.de) {
+          subfield = parseField_DE(indexBuilder, subfield, entryData, j);
+        } else if (indexBuilder.index.sortLanguage == Language.en) {
+          subfield = parseField_EN(indexBuilder, subfield, entryData, j);
+        }
+        // Pass the number of rows in this entry; subfields.length is always 2 (the columns).
+        parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length);
+      }
+    }
+  }
+
+  /**
+   * Indexes the words of one subfield. Text in [brackets] and (parentheses)
+   * is cut out of the field and indexed separately under its own entry type.
+   *
+   * @param subfieldIdx row number of this subfield within its entry
+   * @param numSubFields number of rows in the entry
+   */
+  private void parseFieldGeneric(final IndexBuilder indexBuilder, String field,
+      final EntryData entryData, final int subfieldIdx, final int numSubFields) {
+    // remove bracketed and parenthesized stuff.
+    final StringBuilder bracketed = new StringBuilder();
+    final StringBuilder parenthesized = new StringBuilder();
+
+    // find(), not matches(): the groups sit inside the field, they are not the whole field.
+    Matcher matcher;
+    while ((matcher = BRACKETED.matcher(field)).find()) {
+      bracketed.append(matcher.group(1)).append(" ");
+      field = matcher.replaceFirst(" ");
+    }
+
+    while ((matcher = PARENTHESIZED.matcher(field)).find()) {
+      parenthesized.append(matcher.group(1)).append(" ");
+      field = matcher.replaceFirst(" ");
+    }
+
+    field = SPACES.matcher(field).replaceAll(" ").trim();
+
+    // split words on non -A-z0-9, do them.
+    final String[] tokens = NON_CHAR_DASH.split(field);
+
+    final EntryTypeName entryTypeName;
+    if (numSubFields == 1) {
+      assert subfieldIdx == 0;
+      if (tokens.length == 1) {
+        entryTypeName = EntryTypeName.ONE_WORD;
+      } else if (tokens.length == 2) {
+        entryTypeName = EntryTypeName.TWO_WORDS;
+      } else if (tokens.length == 3) {
+        entryTypeName = EntryTypeName.THREE_WORDS;
+      } else if (tokens.length == 4) {
+        entryTypeName = EntryTypeName.FOUR_WORDS;
+      } else {
+        entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS;
+      }
+    } else {
+      assert numSubFields > 1;
+      if (subfieldIdx == 0) {
+        if (tokens.length == 1) {
+          entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD;
+        } else {
+          entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS;
+        }
+      } else {
+        assert subfieldIdx > 0;
+        if (tokens.length == 1) {
+          entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD;
+        } else {
+          entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS;
+        }
+      }
+    }
+
+    for (String token : tokens) {
+      token = TRIM_PUNC.matcher(token).replaceAll("");
+      if (!alreadyDone.contains(token) && token.length() > 0) {
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, entryTypeName);
+        entries.add(entryData);
+        alreadyDone.add(token);
+
+        // also split words on dashes, do them, too.
+        if (token.contains("-")) {
+          final String[] dashed = token.split("-");
+          for (final String dashedToken : dashed) {
+            if (!alreadyDone.contains(dashedToken) && dashedToken.length() > 0) {
+              final List<EntryData> dashEntries = indexBuilder.getOrCreateEntries(dashedToken, EntryTypeName.PART_OF_HYPHENATED);
+              dashEntries.add(entryData);
+            }
+          }
+        }
+
+      }  // if (!alreadyDone.contains(token)) {
+    }  // for (final String token : tokens) {
+
+    // process bracketed stuff (split on spaces and dashes always)
+    final String[] bracketedTokens = NON_CHAR.split(bracketed.toString());
+    for (final String token : bracketedTokens) {
+      assert !token.contains("-");
+      if (!alreadyDone.contains(token) && token.length() > 0) {
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.BRACKETED);
+        entries.add(entryData);
+      }
+    }
+
+    // process paren stuff
+    final String[] parenTokens = NON_CHAR.split(parenthesized.toString());
+    for (final String token : parenTokens) {
+      assert !token.contains("-");
+      if (!alreadyDone.contains(token) && token.length() > 0) {
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.PARENTHESIZED);
+        entries.add(entryData);
+      }
+    }
+
+  }
+
+  /**
+   * Indexes every word that is followed by a {m}, {f}, {n} or {pl} marker
+   * as a noun.
+   *
+   * @return the field, unchanged
+   */
+  private String parseField_DE(final IndexBuilder indexBuilder, String field,
+      final EntryData entryData, final int subfieldIdx) {
+    final Matcher matcher = DE_NOUN.matcher(field);
+    while (matcher.find()) {
+      final String noun = matcher.group(1);
+      //final String gender = matcher.group(2);
+      if (alreadyDone.add(noun)) {
+        // System.out.println("Found DE noun " + noun + ", " + gender);
+        final List<EntryData> entries = indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN);
+        entries.add(entryData);
+      }
+    }
+    return field;
+  }
+
+  /**
+   * Strips a leading "to " from the field.
+   *
+   * @return the field without the leading "to "
+   */
+  private String parseField_EN(final IndexBuilder indexBuilder, String field,
+      final EntryData entryData, final int subfieldIdx) {
+    if (field.startsWith("to ")) {
+      field = field.substring(3);
+    }
+    return field;
+  }
+
+
+}
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
new file mode 100644
index 0000000..bff164b
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
@@ -0,0 +1,167 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import com.hughes.android.dictionary.Language;
+import com.hughes.util.Args;
+import com.hughes.util.FileUtil;
+
+/*
+
+--maxEntries=100
+--dictOut=de-en.dict
+--lang1=DE
+--lang2=EN
+--dictInfo=@dictInfo.txt
+
+--input0=/Users/thadh/personal/quickDic/de-en-chemnitz.txt
+--input0Name=chemnitz
+--input0Charset=UTF8
+--input0Format=chemnitz
+
+--input1=/Users/thadh/personal/quickDic/dewiktionary-20100326-pages-articles.xml
+--input1Name=wiktionary
+--input1Format=wiktionary
+
+--input2=/Users/thadh/personal/quickDic/de-en-dictcc.txt
+--input2Name=dictcc
+--input2Charset=Cp1252
+--input2Format=dictcc
+ */
+
+/**
+ * Command-line tool that builds a dictionary file from one or more input
+ * files. The block comment above shows sample arguments.
+ */
+public class DictionaryBuilder {
+
+  final Dictionary dictionary;
+
+  final List<EntryData> entryDatas = new ArrayList<EntryData>();
+
+  final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
+
+  /** Creates the dictionary and one index builder per language direction. */
+  public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) {
+    dictionary = new Dictionary(dictInfo);
+    indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0));
+    indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1));
+  }
+
+  /** Builds every index and adds it to the dictionary. */
+  void build() {
+    for (final IndexBuilder indexBuilder : indexBuilders) {
+      indexBuilder.build();
+      dictionary.indices.add(indexBuilder.index);
+    }
+  }
+
+  /**
+   * Parses the --key=value arguments, runs every --inputN file through the
+   * parser selected by --inputNFormat, then writes the dictionary to --dictOut.
+   */
+  public static void main(final String[] args) throws IOException {
+    final Map<String, String> keyValueArgs = Args.keyValueArgs(args);
+
+    final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
+    final Language lang2 = Language.lookup(keyValueArgs.remove("lang2"));
+    if (lang1 == null || lang2 == null) {
+      fatalError("--lang1= and --lang2= must both be specified.");
+    }
+
+    final String dictOutFilename = keyValueArgs.remove("dictOut");
+    if (dictOutFilename == null) {
+      fatalError("--dictOut= must be specified.");
+    }
+
+    String dictInfo = keyValueArgs.remove("dictInfo");
+    if (dictInfo == null) {
+      fatalError("--dictInfo= must be specified.");
+    }
+    if (dictInfo.startsWith("@")) {
+      dictInfo = FileUtil.readToString(new File(dictInfo.substring(1)));
+    }
+
+    final String printFile = keyValueArgs.remove("print");
+
+    System.out.println("lang1=" + lang1);
+    System.out.println("lang2=" + lang2);
+    System.out.println("dictInfo=" + dictInfo);
+    System.out.println("dictOut=" + dictOutFilename);
+
+    final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2);
+
+    for (int i = 0; i < 100; ++i) {
+      final String prefix = "input" + i;
+      if (keyValueArgs.containsKey(prefix)) {
+        final File file = new File(keyValueArgs.remove(prefix));
+        System.out.println("Processing: " + file);
+        String charsetName = keyValueArgs.remove(prefix + "Charset");
+        if (charsetName == null) {
+          charsetName = "UTF8";
+        }
+        final Charset charset = Charset.forName(charsetName);
+        String inputName = keyValueArgs.remove(prefix + "Name");
+        if (inputName == null) {
+          fatalError("Must specify human readable name for: " + prefix + "Name");
+        }
+
+        String inputFormat = keyValueArgs.remove(prefix + "Format");
+        if ("dictcc".equals(inputFormat)) {
+          new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
+        } else if ("chemnitz".equals(inputFormat)) {
+          new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
+        } else if ("wiktionary".equals(inputFormat)) {
+          throw new UnsupportedOperationException("wiktionary input format is not implemented yet");
+//          new WiktionaryXmlParser(dict).parse(file);
+        } else {
+          fatalError("Invalid or missing input format: " + inputFormat);
+        }
+
+        final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName);
+        dictionaryBuilder.dictionary.sources.add(entrySource);
+        System.out.println("Done: " + file + "\n\n");
+      }
+    }
+
+    dictionaryBuilder.build();
+
+    if (printFile != null) {
+      final PrintStream out = new PrintStream(new File(printFile));
+      try {
+        dictionaryBuilder.dictionary.print(out);
+      } finally {
+        out.close();
+      }
+    }
+
+    System.out.println("Writing dictionary to: " + dictOutFilename);
+    final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");
+    try {
+      dictOut.setLength(0);
+      dictionaryBuilder.dictionary.write(dictOut);
+    } finally {
+      dictOut.close();
+    }
+
+    if (!keyValueArgs.isEmpty()) {
+      System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);
+      System.exit(1);
+    }
+
+  }
+
+  /** Prints the message to stderr and exits with status 1. */
+  private static void fatalError(String string) {
+    System.err.println(string);
+    System.exit(1);
+  }
+
+}
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
new file mode 100644
index 0000000..a2468f2
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
@@ -0,0 +1,58 @@
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.RandomAccessFile;
+
+import com.hughes.util.FileUtil;
+
+import junit.framework.TestCase;
+
+/** End-to-end test: builds a small DE/EN dictionary and compares its printout with a golden file. */
+public class DictionaryBuilderTest extends TestCase {
+
+  public void testGermanCombined() throws IOException {
+    final File result = File.createTempFile("de_en", ".dict");
+    System.out.println("Writing to: " + result);
+    DictionaryBuilder.main(new String[] {
+        "--dictOut=" + result.getAbsolutePath(),
+        "--lang1=DE",
+        "--lang2=EN",
+        "--dictInfo=@testdata/de_en_dictInfo.txt",
+
+        "--input1=testdata/de-en-chemnitz_100",
+        // NOTE(review): the name says dictcc but the input is the chemnitz sample - confirm against the golden file.
+        "--input1Name=dictcc",
+        "--input1Charset=UTF8",
+        "--input1Format=chemnitz",
+
+        "--input2=testdata/de-en-dictcc_100",
+        "--input2Name=dictcc",
+        "--input2Charset=UTF8",
+        "--input2Format=dictcc",
+
+        "--print=testdata/de_en.test",
+    });
+
+    // Check it once:
+    assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+
+
+    // Check it again.
+    final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r"));
+    final PrintStream out = new PrintStream(new File("testdata/de_en.test"));
+    dict.print(out);
+    out.close();
+
+    assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+  }
+
+
+  void assertFilesEqual(final String expected, final String actual) throws IOException {
+    final String expectedString = FileUtil.readToString(new File(expected));
+    final String actualString = FileUtil.readToString(new File(actual));
+    assertEquals(expectedString, actualString);
+  }
+
+}
diff --git a/src/com/hughes/android/dictionary/engine/EntryData.java b/src/com/hughes/android/dictionary/engine/EntryData.java
new file mode 100644
index 0000000..7f6b9b5
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/EntryData.java
@@ -0,0 +1,14 @@
+/**
+ * Pairs an Entry with the index passed on to IndexedObject.
+ */
+package com.hughes.android.dictionary.engine;
+
+import com.hughes.util.IndexedObject;
+
+class EntryData extends IndexedObject {
+  EntryData(final int index, final Entry entry) {
+    super(index);
+    this.entry = entry;
+  }
+  Entry entry;
+}
\ No newline at end of file
diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java
new file mode 100644
index 0000000..44ff0d3
--- /dev/null
+++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java
@@ -0,0 +1,93 @@
+package com.hughes.android.dictionary.engine;
+
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import com.hughes.android.dictionary.Language;
+
+/**
+ * Collects, per token, the entries to list under that token, and turns them
+ * into the rows of an Index.
+ */
+public class IndexBuilder {
+
+  final DictionaryBuilder dictionaryBuilder;
+  final Index index;
+
+  final SortedMap<String, TokenData> tokenToData;
+
+  @SuppressWarnings("unchecked")
+  IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language) {
+    this.dictionaryBuilder = dictionaryBuilder;
+    index = new Index(dictionaryBuilder.dictionary, shortName, longName, language);
+    tokenToData = new TreeMap<String, TokenData>(language.getSortCollator());
+  }
+
+  /**
+   * Emits, for every token in sort order, one token row followed by one row
+   * per distinct entry collected for that token.
+   */
+  public void build() {
+    final Set<EntryData> tokenEntryDatas = new HashSet<EntryData>();
+    // NOTE(review): the element type of index.rows is not visible in this patch; left raw.
+    final List rows = index.rows;
+    for (final TokenData tokenData : tokenToData.values()) {
+      tokenEntryDatas.clear();
+      final int indexRow = index.sortedIndexEntries.size();
+      index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, rows.size()));
+      rows.add(new TokenRow(indexRow, rows.size(), index));
+      int count = 0;
+      for (final List<EntryData> entryDatas : tokenData.typeToEntries.values()) {
+        for (final EntryData entryData : entryDatas) {
+          if (tokenEntryDatas.add(entryData)) {
+            rows.add(new PairEntry.Row(entryData.index(), rows.size(), index));
+            ++count;
+          }
+        }
+      }
+      System.out.println(count + " ENTRIES FOR TOKEN " + tokenData.token);
+    }
+  }
+
+  /** The entries collected for one token, grouped by entry type. */
+  static class TokenData {
+    final String token;
+
+    final Map<EntryTypeName, List<EntryData>> typeToEntries = new EnumMap<EntryTypeName, List<EntryData>>(EntryTypeName.class);
+
+    TokenData(final String token) {
+      assert token.equals(token.trim());
+      assert token.length() > 0;
+      this.token = token;
+    }
+  }
+
+  /** Returns the TokenData for the token, creating and registering it if absent. */
+  public TokenData getOrCreateTokenData(final String token) {
+    TokenData tokenData = tokenToData.get(token);
+    if (tokenData == null) {
+      tokenData = new TokenData(token);
+      tokenToData.put(token, tokenData);
+    }
+    return tokenData;
+  }
+
+  /** Returns the mutable entry list for the token and entry type, creating it if absent. */
+  public List<EntryData> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
+    final TokenData tokenData = getOrCreateTokenData(token);
+    List<EntryData> entries = tokenData.typeToEntries.get(entryTypeName);
+    if (entries == null) {
+      entries = new ArrayList<EntryData>();
+      tokenData.typeToEntries.put(entryTypeName, entries);
+    }
+    return entries;
+  }
+
+
+}