X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FDictFileParser.java;h=1f08428761d42911a99a29c3c07a71f6c7f271cc;hb=020aa910526ece05ee8514e55a9a951b45ce1fea;hp=07d077562b8b106392d201123eb26aa753e48781;hpb=e479ba38bbcb261951399326623c20ffacc147d4;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 07d0775..1f08428 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -15,44 +15,47 @@ package com.hughes.android.dictionary.parser; import java.io.BufferedReader; +import java.io.BufferedInputStream; +import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; +import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashSet; -import java.util.List; import java.util.Set; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.hughes.android.dictionary.engine.AbstractEntry; import com.hughes.android.dictionary.engine.DictionaryBuilder; import com.hughes.android.dictionary.engine.EntrySource; -import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; +import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; -import com.hughes.android.dictionary.engine.PairEntry.Pair; +import com.hughes.util.StringUtil; public class DictFileParser implements Parser { static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); // Dictcc - public static final Pattern TAB = Pattern.compile("\\t"); + public static final String TAB = "\t"; // Chemnitz - public static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); - public static final Pattern PIPE = Pattern.compile("\\|"); + public static final String DOUBLE_COLON = " :: "; + public static final String PIPE = "|"; static final Pattern SPACES = Pattern.compile("\\s+"); - static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]"); - static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); // http://www.regular-expressions.info/unicode.html @@ -64,34 +67,33 @@ public class DictFileParser implements Parser { final Charset charset; final boolean flipCols; - final Pattern fieldSplit; - final Pattern subfieldSplit; + final String fieldSplit; + final String subfieldSplit; + + final boolean singleLang; final DictionaryBuilder dictBuilder; - final IndexBuilder[] langIndexBuilders; - final IndexBuilder bothIndexBuilder; EntrySource entrySource; // final Set alreadyDone = new HashSet(); public DictFileParser(final Charset charset, boolean flipCols, - final Pattern fieldSplit, final Pattern subfieldSplit, - final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders, - final IndexBuilder bothIndexBuilder) { + final String fieldSplit, final String subfieldSplit, + final boolean singleLang, + final DictionaryBuilder dictBuilder) { this.charset = charset; this.flipCols = flipCols; this.fieldSplit = fieldSplit; this.subfieldSplit = subfieldSplit; + this.singleLang = singleLang; this.dictBuilder = dictBuilder; - this.langIndexBuilders = langIndexBuilders; - this.bothIndexBuilder = bothIndexBuilder; } @Override public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { this.entrySource = entrySouce; - final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); + final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); String line; int count = 0; while ((line = reader.readLine()) != null) { @@ -107,14 +109,13 @@ public class DictFileParser implements Parser { } private void parseLine(final String line) { - if (line.startsWith("#") || line.length() == 0) { + if (line.startsWith("#") || line.isEmpty()) { logger.info("Skipping comment line: " + line); return; } - final String[] fields = fieldSplit.split(line); - // dictcc now has a part of speech field as field #3. - if (fields.length < 2 || fields.length > 3) { - logger.warning("Malformed line: " + line); + final String[] fields = StringUtil.split(line, fieldSplit); + if (fields.length < 2 || fields.length > 4) { + logger.warning("Malformed line, expected 3 or 4 fields, got " + fields.length + ": " + line); return; } @@ -128,8 +129,8 @@ public class DictFileParser implements Parser { final String[][] subfields = new String[2][]; if (subfieldSplit != null) { - subfields[0] = subfieldSplit.split(fields[0]); - subfields[1] = subfieldSplit.split(fields[1]); + subfields[0] = StringUtil.split(fields[0], subfieldSplit); + subfields[1] = StringUtil.split(fields[1], subfieldSplit); if (subfields[0].length != subfields[1].length) { logger.warning("Number of subfields doesn't match: " + line); return; @@ -139,59 +140,92 @@ public class DictFileParser implements Parser { subfields[1] = new String[] { fields[1] }; } - final PairEntry pairEntry = new PairEntry(entrySource); for (int i = 0; i < subfields[0].length; ++i) { subfields[0][i] = subfields[0][i].trim(); subfields[1][i] = subfields[1][i].trim(); - if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { + if (subfields[0][i].isEmpty() && subfields[1][i].isEmpty()) { logger.warning("Empty pair: " + line); continue; } - if (subfields[0][i].length() == 0) { + if (subfields[0][i].isEmpty()) { subfields[0][i] = "__"; } - if (subfields[1][i].length() == 0) { + if (subfields[1][i].isEmpty()) { subfields[1][i] = "__"; } - pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); } - final IndexedEntry entryData = new IndexedEntry(pairEntry); - entryData.isValid = true; - - for (int l = 0; l < 2; ++l) { - // alreadyDone.clear(); - - for (int j = 0; j < subfields[l].length; ++j) { - String subfield = subfields[l][j]; - final IndexBuilder indexBuilder = langIndexBuilders[l]; - if (indexBuilder.index.sortLanguage == Language.de) { - subfield = parseField_DE(indexBuilder, subfield, entryData, j); - } else if (indexBuilder.index.sortLanguage == Language.en) { - subfield = parseField_EN(indexBuilder, subfield, entryData, j); + + if (singleLang) { + HtmlEntry htmlEntry = new HtmlEntry(entrySource, fields[0]); + htmlEntry.html = StringUtil.escapeUnicodeToPureHtml(fields[1]); + + final IndexBuilder titleIndexBuilder = dictBuilder.indexBuilders.get(0); + htmlEntry.addToDictionary(titleIndexBuilder.index.dict); + + TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(fields[0]); + tokenData.hasMainEntry = true; + tokenData.htmlEntries.add(0, htmlEntry); + + final String[] tokens = NON_CHAR.split(fields[0]); + if (tokens.length > 1) { + for (final String token : tokens) { + assert token.length() >= 1; + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + tokenData = titleIndexBuilder.getOrCreateTokenData(token); + tokenData.htmlEntries.add(htmlEntry); + } + } + } + final IndexedEntry entryData = new IndexedEntry(htmlEntry); + entryData.isValid = true; + + } else { + PairEntry pairEntry = new PairEntry(entrySource); + for (int i = 0; i < subfields[0].length; ++i) { + pairEntry.pairs.add(new PairEntry.Pair(subfields[0][i], subfields[1][i])); + } + final IndexedEntry entryData = new IndexedEntry(pairEntry); + entryData.isValid = true; + for (int l = 0; l < 2; ++l) { + // alreadyDone.clear(); + + final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(l); + for (int j = 0; j < subfields[l].length; ++j) { + String subfield = subfields[l][j]; + if (indexBuilder.index.sortLanguage == Language.de) { + subfield = parseField_DE(indexBuilder, subfield, entryData, j); + } else if (indexBuilder.index.sortLanguage == Language.en) { + subfield = parseField_EN(indexBuilder, subfield, entryData, j); + } + parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length); } - parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length); } } + + } + + private StringBuilder extractParenthesized(StringBuilder in, String startChar, String endChar) { + StringBuilder res = new StringBuilder(); + int pos = 0; + while ((pos = in.indexOf(startChar, pos)) != -1) { + int end = in.indexOf(endChar, pos + 1); + if (end == -1) break; + res.append(in, pos + 1, end).append(" "); + in.replace(pos, end + 1, " "); + pos++; // skip the just appended space + } + return res; } private void parseFieldGeneric(final IndexBuilder indexBuilder, String field, final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) { + final StringBuilder fieldsb = new StringBuilder(field); // remove bracketed and parenthesized stuff. - final StringBuilder bracketed = new StringBuilder(); - final StringBuilder parenthesized = new StringBuilder(); + final StringBuilder bracketed = extractParenthesized(fieldsb, "[", "]"); + final StringBuilder parenthesized = extractParenthesized(fieldsb, "(", ")"); - Matcher matcher; - while ((matcher = BRACKETED.matcher(field)).find()) { - bracketed.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); - } - - while ((matcher = PARENTHESIZED.matcher(field)).find()) { - parenthesized.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); - } - - field = SPACES.matcher(field).replaceAll(" ").trim(); + field = fieldsb.toString().trim(); // split words on non -A-z0-9, do them. final String[] tokens = NON_CHAR_DASH.split(field); @@ -230,15 +264,15 @@ public class DictFileParser implements Parser { for (String token : tokens) { token = TRIM_PUNC.matcher(token).replaceAll(""); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName); // alreadyDone.add(token); // also split words on dashes, do them, too. - if (token.contains("-")) { - final String[] dashed = token.split("-"); + if (token.indexOf('-') != -1) { + final String[] dashed = StringUtil.split(token, "-"); for (final String dashedToken : dashed) { - if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) { + if (/*!alreadyDone.contains(dashedToken) && */!dashedToken.isEmpty()) { indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED); } } @@ -248,20 +282,24 @@ public class DictFileParser implements Parser { } // for (final String token : tokens) { // process bracketed stuff (split on spaces and dashes always) - final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); - for (final String token : bracketedTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); + if (bracketed.length() > 0) { + final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); + for (final String token : bracketedTokens) { + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); + } } } // process paren stuff - final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); - for (final String token : parenTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); + if (parenthesized.length() > 0) { + final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); + for (final String token : parenTokens) { + assert token.indexOf("-") == -1; + if (/*!alreadyDone.contains(token) && */!token.isEmpty()) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); + } } } @@ -281,6 +319,8 @@ public class DictFileParser implements Parser { // } // } + if (field.indexOf('{') == -1) return field; + // In English, curly braces are used for different tenses. field = CURLY_BRACED.matcher(field).replaceAll(" "); @@ -295,9 +335,9 @@ public class DictFileParser implements Parser { return field; } - public static final Set tokenize(final String text, final Pattern pattern) { + public static Set tokenize(final String text, final Pattern pattern) { final String[] split = pattern.split(text); - final Set result = new LinkedHashSet(Arrays.asList(split)); + final Set result = new LinkedHashSet<>(Arrays.asList(split)); result.remove(""); return result; }