X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FDictFileParser.java;h=991ed8ad417b29e4941f68fb07e4a1eaf4b4248b;hb=4b1b911b69b63a157189a44124861cf365b12888;hp=67ca43293cbdbf9bde04bd27de5ff03c7f55ce50;hpb=5fab504f765ff1553c98096ba85b04ffc2ef1062;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 67ca432..991ed8a 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -30,6 +30,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.EntrySource; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; @@ -37,7 +38,7 @@ import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; -public class DictFileParser { +public class DictFileParser implements Parser { static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); @@ -49,17 +50,15 @@ public class DictFileParser { public static final Pattern PIPE = Pattern.compile("\\|"); static final Pattern SPACES = Pattern.compile("\\s+"); -// static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}"); -// static final Pattern EN_VERB = Pattern.compile("^to ([^ ]+)"); static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]"); static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); - static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+"); - public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+"); + static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+"); + public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+"); - static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$"); + static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$"); final Charset charset; final boolean flipCols; @@ -71,6 +70,8 @@ public class DictFileParser { final IndexBuilder[] langIndexBuilders; final IndexBuilder bothIndexBuilder; + EntrySource entrySource; + // final Set alreadyDone = new HashSet(); public DictFileParser(final Charset charset, boolean flipCols, @@ -86,11 +87,16 @@ public class DictFileParser { this.bothIndexBuilder = bothIndexBuilder; } - public void parseFile(final File file) throws IOException { + @Override + public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { + this.entrySource = entrySouce; final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); String line; int count = 0; while ((line = reader.readLine()) != null) { + if (pageLimit >= 0 && count >= pageLimit) { + return; + } if (count % 10000 == 0) { logger.info("count=" + count + ", line=" + line); } @@ -131,10 +137,20 @@ public class DictFileParser { subfields[1] = new String[] { fields[1] }; } - final PairEntry pairEntry = new PairEntry(); + final PairEntry pairEntry = new PairEntry(entrySource); for (int i = 0; i < subfields[0].length; ++i) { subfields[0][i] = subfields[0][i].trim(); subfields[1][i] = subfields[1][i].trim(); + if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { + logger.warning("Empty pair: " + line); + continue; + } + if (subfields[0][i].length() == 0) { + subfields[0][i] = "__"; + } + if (subfields[1][i].length() == 0) { + subfields[1][i] = "__"; + } pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); } final IndexedEntry entryData = new IndexedEntry(pairEntry);