X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FDictFileParser.java;h=8c6424b8543ab663d71eaf5d8a9a7892dc6de7fd;hp=cea4f90bb4494edf8a81c7302c6e4978ac997fc0;hb=d8daf271f63e308eab3917a6bcc09b56035e0489;hpb=a8052a74747df9244c098041dc82c745f64d51c6 diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index cea4f90..8c6424b 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -21,243 +21,257 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashSet; -import java.util.List; import java.util.Set; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.hughes.android.dictionary.engine.DictionaryBuilder; -import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.EntrySource; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; -import com.hughes.android.dictionary.engine.PairEntry.Pair; - -public class DictFileParser { - - static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); - - // Dictcc - public static final Pattern TAB = Pattern.compile("\\t"); - - // Chemnitz - public static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); - public static final Pattern PIPE = Pattern.compile("\\|"); - - static final Pattern SPACES = Pattern.compile("\\s+"); -// static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}"); -// static final Pattern EN_VERB = Pattern.compile("^to ([^ ]+)"); - - static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]"); - static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); - static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); - - static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+"); - public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+"); - - static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$"); - - final Charset charset; - final boolean flipCols; - - final Pattern fieldSplit; - final Pattern subfieldSplit; - - final DictionaryBuilder dictBuilder; - final IndexBuilder[] langIndexBuilders; - final IndexBuilder bothIndexBuilder; - - // final Set alreadyDone = new HashSet(); - - public DictFileParser(final Charset charset, boolean flipCols, - final Pattern fieldSplit, final Pattern subfieldSplit, - final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders, - final IndexBuilder bothIndexBuilder) { - this.charset = charset; - this.flipCols = flipCols; - this.fieldSplit = fieldSplit; - this.subfieldSplit = subfieldSplit; - this.dictBuilder = dictBuilder; - this.langIndexBuilders = langIndexBuilders; - this.bothIndexBuilder = bothIndexBuilder; - } - - public void parseFile(final File file) throws IOException { - final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); - String line; - int count = 0; - while ((line = reader.readLine()) != null) { - if (count % 10000 == 0) { - logger.info("count=" + count + ", line=" + line); - } - parseLine(line); - ++count; - } - } - - private void parseLine(final String line) { - if (line.startsWith("#") || line.length() == 0) { - logger.info("Skipping comment line: " + line); - return; - } - final String[] fields = fieldSplit.split(line); - if (fields.length != 2) { - logger.warning("Malformed line: " + line); - return; - } - - fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim(); - fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim(); - if (flipCols) { - final String temp = fields[0]; - fields[0] = fields[1]; - fields[1] = temp; - } +import com.hughes.util.StringUtil; - final String[][] subfields = new String[2][]; - if (subfieldSplit != null) { - subfields[0] = subfieldSplit.split(fields[0]); - subfields[1] = subfieldSplit.split(fields[1]); - if (subfields[0].length != subfields[1].length) { - logger.warning("Number of subfields doesn't match: " + line); - return; - } - } else { - subfields[0] = new String[] { fields[0] }; - subfields[1] = new String[] { fields[1] }; - } - - final PairEntry pairEntry = new PairEntry(); - for (int i = 0; i < subfields[0].length; ++i) { - subfields[0][i] = subfields[0][i].trim(); - subfields[1][i] = subfields[1][i].trim(); - pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); +public class DictFileParser implements Parser { + + static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); + + // Dictcc + public static final String TAB = "\t"; + + // Chemnitz + public static final String DOUBLE_COLON = " :: "; + public static final String PIPE = "|"; + + static final Pattern SPACES = Pattern.compile("\\s+"); + + static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); + + // http://www.regular-expressions.info/unicode.html + static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+"); + public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+"); + + static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$"); + + final Charset charset; + final boolean flipCols; + + final String fieldSplit; + final String subfieldSplit; + + final DictionaryBuilder dictBuilder; + + EntrySource entrySource; + + // final Set alreadyDone = new HashSet(); + + public DictFileParser(final Charset charset, boolean flipCols, + final String fieldSplit, final String subfieldSplit, + final DictionaryBuilder dictBuilder) { + this.charset = charset; + this.flipCols = flipCols; + this.fieldSplit = fieldSplit; + this.subfieldSplit = subfieldSplit; + this.dictBuilder = dictBuilder; } - final IndexedEntry entryData = new IndexedEntry(pairEntry); - - for (int l = 0; l < 2; ++l) { - // alreadyDone.clear(); - - for (int j = 0; j < subfields[l].length; ++j) { - String subfield = subfields[l][j]; - final IndexBuilder indexBuilder = langIndexBuilders[l]; - if (indexBuilder.index.sortLanguage == Language.de) { - subfield = parseField_DE(indexBuilder, subfield, entryData, j); - } else if (indexBuilder.index.sortLanguage == Language.en) { - subfield = parseField_EN(indexBuilder, subfield, entryData, j); + + @Override + public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { + this.entrySource = entrySouce; + final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); + String line; + int count = 0; + while ((line = reader.readLine()) != null) { + if (pageLimit >= 0 && count >= pageLimit) { + return; + } + if (count % 10000 == 0) { + logger.info("count=" + count + ", line=" + line); + } + parseLine(line); + ++count; } - parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length); - } } - } - - private void parseFieldGeneric(final IndexBuilder indexBuilder, String field, - final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) { - // remove bracketed and parenthesized stuff. - final StringBuilder bracketed = new StringBuilder(); - final StringBuilder parenthesized = new StringBuilder(); - - Matcher matcher; - while ((matcher = BRACKETED.matcher(field)).find()) { - bracketed.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); + + private void parseLine(final String line) { + if (line.startsWith("#") || line.isEmpty()) { + logger.info("Skipping comment line: " + line); + return; + } + final String[] fields = StringUtil.split(line, fieldSplit); + if (fields.length < 2 || fields.length > 4) { + logger.warning("Malformed line, expected 3 or 4 fields, got " + fields.length + ": " + line); + return; + } + + fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim(); + fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim(); + if (flipCols) { + final String temp = fields[0]; + fields[0] = fields[1]; + fields[1] = temp; + } + + final String[][] subfields = new String[2][]; + if (subfieldSplit != null) { + subfields[0] = StringUtil.split(fields[0], subfieldSplit); + subfields[1] = StringUtil.split(fields[1], subfieldSplit); + if (subfields[0].length != subfields[1].length) { + logger.warning("Number of subfields doesn't match: " + line); + return; + } + } else { + subfields[0] = new String[] { fields[0] }; + subfields[1] = new String[] { fields[1] }; + } + + final PairEntry pairEntry = new PairEntry(entrySource); + for (int i = 0; i < subfields[0].length; ++i) { + subfields[0][i] = subfields[0][i].trim(); + subfields[1][i] = subfields[1][i].trim(); + if (subfields[0][i].isEmpty() && subfields[1][i].isEmpty()) { + logger.warning("Empty pair: " + line); + continue; + } + if (subfields[0][i].isEmpty()) { + subfields[0][i] = "__"; + } + if (subfields[1][i].isEmpty()) { + subfields[1][i] = "__"; + } + pairEntry.pairs.add(new PairEntry.Pair(subfields[0][i], subfields[1][i])); + } + final IndexedEntry entryData = new IndexedEntry(pairEntry); + entryData.isValid = true; + + for (int l = 0; l < 2; ++l) { + // alreadyDone.clear(); + + final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(l); + for (int j = 0; j < subfields[l].length; ++j) { + String subfield = subfields[l][j]; + if (indexBuilder.index.sortLanguage == Language.de) { + subfield = parseField_DE(indexBuilder, subfield, entryData, j); + } else if (indexBuilder.index.sortLanguage == Language.en) { + subfield = parseField_EN(indexBuilder, subfield, entryData, j); + } + parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length); + } + } } - while ((matcher = PARENTHESIZED.matcher(field)).find()) { - parenthesized.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); + private StringBuilder extractParenthesized(StringBuilder in, String startChar, String endChar) { + StringBuilder res = new StringBuilder(); + int pos = 0; + while ((pos = in.indexOf(startChar, pos)) != -1) { + int end = in.indexOf(endChar, pos + 1); + if (end == -1) break; + res.append(in, pos + 1, end).append(" "); + in.replace(pos, end + 1, " "); + pos++; // skip the just appended space + } + return res; } - - field = SPACES.matcher(field).replaceAll(" ").trim(); - - // split words on non -A-z0-9, do them. - final String[] tokens = NON_CHAR_DASH.split(field); - - final EntryTypeName entryTypeName; - if (numSubFields == 1) { - assert subfieldIdx == 0; - if (tokens.length == 1) { - entryTypeName = EntryTypeName.ONE_WORD; - } else if (tokens.length == 2) { - entryTypeName = EntryTypeName.TWO_WORDS; - } else if (tokens.length == 3) { - entryTypeName = EntryTypeName.THREE_WORDS; - } else if (tokens.length == 4) { - entryTypeName = EntryTypeName.FOUR_WORDS; - } else { - entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS; - } - } else { - assert numSubFields > 1; - if (subfieldIdx == 0) { - if (tokens.length == 1) { - entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD; + + private void parseFieldGeneric(final IndexBuilder indexBuilder, String field, + final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) { + final StringBuilder fieldsb = new StringBuilder(field); + // remove bracketed and parenthesized stuff. + final StringBuilder bracketed = extractParenthesized(fieldsb, "[", "]"); + final StringBuilder parenthesized = extractParenthesized(fieldsb, "(", ")"); + + field = fieldsb.toString().trim(); + + // split words on non -A-z0-9, do them. + final String[] tokens = NON_CHAR_DASH.split(field); + + final EntryTypeName entryTypeName; + if (numSubFields == 1) { + assert subfieldIdx == 0; + if (tokens.length == 1) { + entryTypeName = EntryTypeName.ONE_WORD; + } else if (tokens.length == 2) { + entryTypeName = EntryTypeName.TWO_WORDS; + } else if (tokens.length == 3) { + entryTypeName = EntryTypeName.THREE_WORDS; + } else if (tokens.length == 4) { + entryTypeName = EntryTypeName.FOUR_WORDS; + } else { + entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS; + } } else { - entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS; + assert numSubFields > 1; + if (subfieldIdx == 0) { + if (tokens.length == 1) { + entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD; + } else { + entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS; + } + } else { + assert subfieldIdx > 0; + if (tokens.length == 1) { + entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD; + } else { + entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS; + } + } } - } else { - assert subfieldIdx > 0; - if (tokens.length == 1) { - entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD; - } else { - entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS; + + for (String token : tokens) { + token = TRIM_PUNC.matcher(token).replaceAll(""); + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName); + // alreadyDone.add(token); + + // also split words on dashes, do them, too. + if (token.indexOf('-') != -1) { + final String[] dashed = StringUtil.split(token, "-"); + for (final String dashedToken : dashed) { + if (/*!alreadyDone.contains(dashedToken) && */!dashedToken.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED); + } + } + } + + } // if (!alreadyDone.contains(token)) { + } // for (final String token : tokens) { + + // process bracketed stuff (split on spaces and dashes always) + if (bracketed.length() > 0) { + final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); + for (final String token : bracketedTokens) { + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); + } + } } - } - } - for (String token : tokens) { - token = TRIM_PUNC.matcher(token).replaceAll(""); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - final List entries = indexBuilder.getOrCreateEntries(token, entryTypeName); - entries.add(entryData); - // alreadyDone.add(token); - - // also split words on dashes, do them, too. - if (token.contains("-")) { - final String[] dashed = token.split("-"); - for (final String dashedToken : dashed) { - if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) { - final List dashEntries = indexBuilder.getOrCreateEntries(dashedToken, EntryTypeName.PART_OF_HYPHENATED); - dashEntries.add(entryData); + // process paren stuff + if (parenthesized.length() > 0) { + final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); + for (final String token : parenTokens) { + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); + } } - } } - } // if (!alreadyDone.contains(token)) { - } // for (final String token : tokens) { - - // process bracketed stuff (split on spaces and dashes always) - final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); - for (final String token : bracketedTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - final List entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.BRACKETED); - entries.add(entryData); - } - } - - // process paren stuff - final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); - for (final String token : parenTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - final List entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.PARENTHESIZED); - entries.add(entryData); - } } - - } - private String parseField_DE(final IndexBuilder indexBuilder, String field, - final IndexedEntry entryData, final int subfieldIdx) { - + private String parseField_DE(final IndexBuilder indexBuilder, String field, + final IndexedEntry entryData, final int subfieldIdx) { + // final Matcher matcher = DE_NOUN.matcher(field); // while (matcher.find()) { // final String noun = matcher.group(1); - //final String gender = matcher.group(2); + //final String gender = matcher.group(2); // if (alreadyDone.add(noun)) { // System.out.println("Found DE noun " + noun + ", " + gender); // final List entries = indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN); @@ -265,26 +279,28 @@ public class DictFileParser { // } // } - // In English, curly braces are used for different tenses. - field = CURLY_BRACED.matcher(field).replaceAll(" "); + if (field.indexOf('{') == -1) return field; + + // In English, curly braces are used for different tenses. + field = CURLY_BRACED.matcher(field).replaceAll(" "); + + return field; + } + + private String parseField_EN(final IndexBuilder indexBuilder, String field, + final IndexedEntry entryData, final int subfieldIdx) { + if (field.startsWith("to ")) { + field = field.substring(3); + } + return field; + } - return field; - } - - private String parseField_EN(final IndexBuilder indexBuilder, String field, - final IndexedEntry entryData, final int subfieldIdx) { - if (field.startsWith("to ")) { - field = field.substring(3); + public static Set tokenize(final String text, final Pattern pattern) { + final String[] split = pattern.split(text); + final Set result = new LinkedHashSet<>(Arrays.asList(split)); + result.remove(""); + return result; } - return field; - } - - public static final Set tokenize(final String text, final Pattern pattern) { - final String[] split = pattern.split(text); - final Set result = new LinkedHashSet(Arrays.asList(split)); - result.remove(""); - return result; - } }