X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FDictFileParser.java;h=8c6424b8543ab663d71eaf5d8a9a7892dc6de7fd;hp=0fa43060086ffce54d1567d9ecd3530c7b94586e;hb=d8daf271f63e308eab3917a6bcc09b56035e0489;hpb=0a53dc44bc2c7a10cc7bd073499b0d01289baed3 diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 0fa4306..8c6424b 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -35,22 +35,21 @@ import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; +import com.hughes.util.StringUtil; public class DictFileParser implements Parser { static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); // Dictcc - public static final Pattern TAB = Pattern.compile("\\t"); + public static final String TAB = "\t"; // Chemnitz - public static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); - public static final Pattern PIPE = Pattern.compile("\\|"); + public static final String DOUBLE_COLON = " :: "; + public static final String PIPE = "|"; static final Pattern SPACES = Pattern.compile("\\s+"); - static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]"); - static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); // http://www.regular-expressions.info/unicode.html @@ -62,28 +61,23 @@ public class DictFileParser implements Parser { final Charset charset; final boolean flipCols; - final Pattern fieldSplit; - final Pattern subfieldSplit; + final String fieldSplit; + final String subfieldSplit; final DictionaryBuilder dictBuilder; - final IndexBuilder[] langIndexBuilders; - final IndexBuilder bothIndexBuilder; EntrySource entrySource; // final Set alreadyDone = new HashSet(); public DictFileParser(final Charset charset, boolean flipCols, - final Pattern fieldSplit, final Pattern subfieldSplit, - final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders, - final IndexBuilder bothIndexBuilder) { + final String fieldSplit, final String subfieldSplit, + final DictionaryBuilder dictBuilder) { this.charset = charset; this.flipCols = flipCols; this.fieldSplit = fieldSplit; this.subfieldSplit = subfieldSplit; this.dictBuilder = dictBuilder; - this.langIndexBuilders = langIndexBuilders; - this.bothIndexBuilder = bothIndexBuilder; } @Override @@ -105,11 +99,11 @@ public class DictFileParser implements Parser { } private void parseLine(final String line) { - if (line.startsWith("#") || line.length() == 0) { + if (line.startsWith("#") || line.isEmpty()) { logger.info("Skipping comment line: " + line); return; } - final String[] fields = fieldSplit.split(line); + final String[] fields = StringUtil.split(line, fieldSplit); if (fields.length < 2 || fields.length > 4) { logger.warning("Malformed line, expected 3 or 4 fields, got " + fields.length + ": " + line); return; @@ -125,8 +119,8 @@ public class DictFileParser implements Parser { final String[][] subfields = new String[2][]; if (subfieldSplit != null) { - subfields[0] = subfieldSplit.split(fields[0]); - subfields[1] = subfieldSplit.split(fields[1]); + subfields[0] = StringUtil.split(fields[0], subfieldSplit); + subfields[1] = StringUtil.split(fields[1], subfieldSplit); if (subfields[0].length != subfields[1].length) { logger.warning("Number of subfields doesn't match: " + line); return; @@ -140,14 +134,14 @@ public class DictFileParser implements Parser { for (int i = 0; i < subfields[0].length; ++i) { subfields[0][i] = subfields[0][i].trim(); subfields[1][i] = subfields[1][i].trim(); - if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { + if (subfields[0][i].isEmpty() && subfields[1][i].isEmpty()) { logger.warning("Empty pair: " + line); continue; } - if (subfields[0][i].length() == 0) { + if (subfields[0][i].isEmpty()) { subfields[0][i] = "__"; } - if (subfields[1][i].length() == 0) { + if (subfields[1][i].isEmpty()) { subfields[1][i] = "__"; } pairEntry.pairs.add(new PairEntry.Pair(subfields[0][i], subfields[1][i])); @@ -158,9 +152,9 @@ public class DictFileParser implements Parser { for (int l = 0; l < 2; ++l) { // alreadyDone.clear(); + final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(l); for (int j = 0; j < subfields[l].length; ++j) { String subfield = subfields[l][j]; - final IndexBuilder indexBuilder = langIndexBuilders[l]; if (indexBuilder.index.sortLanguage == Language.de) { subfield = parseField_DE(indexBuilder, subfield, entryData, j); } else if (indexBuilder.index.sortLanguage == Language.en) { @@ -171,24 +165,27 @@ public class DictFileParser implements Parser { } } + private StringBuilder extractParenthesized(StringBuilder in, String startChar, String endChar) { + StringBuilder res = new StringBuilder(); + int pos = 0; + while ((pos = in.indexOf(startChar, pos)) != -1) { + int end = in.indexOf(endChar, pos + 1); + if (end == -1) break; + res.append(in, pos + 1, end).append(" "); + in.replace(pos, end + 1, " "); + pos++; // skip the just appended space + } + return res; + } + private void parseFieldGeneric(final IndexBuilder indexBuilder, String field, final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) { + final StringBuilder fieldsb = new StringBuilder(field); // remove bracketed and parenthesized stuff. - final StringBuilder bracketed = new StringBuilder(); - final StringBuilder parenthesized = new StringBuilder(); - - Matcher matcher; - while ((matcher = BRACKETED.matcher(field)).find()) { - bracketed.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); - } + final StringBuilder bracketed = extractParenthesized(fieldsb, "[", "]"); + final StringBuilder parenthesized = extractParenthesized(fieldsb, "(", ")"); - while ((matcher = PARENTHESIZED.matcher(field)).find()) { - parenthesized.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); - } - - field = SPACES.matcher(field).replaceAll(" ").trim(); + field = fieldsb.toString().trim(); // split words on non -A-z0-9, do them. final String[] tokens = NON_CHAR_DASH.split(field); @@ -227,15 +224,15 @@ public class DictFileParser implements Parser { for (String token : tokens) { token = TRIM_PUNC.matcher(token).replaceAll(""); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName); // alreadyDone.add(token); // also split words on dashes, do them, too. - if (token.contains("-")) { - final String[] dashed = token.split("-"); + if (token.indexOf('-') != -1) { + final String[] dashed = StringUtil.split(token, "-"); for (final String dashedToken : dashed) { - if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) { + if (/*!alreadyDone.contains(dashedToken) && */!dashedToken.isEmpty()) { indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED); } } @@ -245,20 +242,24 @@ public class DictFileParser implements Parser { } // for (final String token : tokens) { // process bracketed stuff (split on spaces and dashes always) - final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); - for (final String token : bracketedTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); + if (bracketed.length() > 0) { + final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); + for (final String token : bracketedTokens) { + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); + } } } // process paren stuff - final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); - for (final String token : parenTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); + if (parenthesized.length() > 0) { + final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); + for (final String token : parenTokens) { + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); + } } } @@ -278,6 +279,8 @@ public class DictFileParser implements Parser { // } // } + if (field.indexOf('{') == -1) return field; + // In English, curly braces are used for different tenses. field = CURLY_BRACED.matcher(field).replaceAll(" ");