// Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.hughes.android.dictionary.parser; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Set; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.hughes.android.dictionary.engine.DictionaryBuilder; import com.hughes.android.dictionary.engine.EntrySource; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.util.StringUtil; public class SingleDictFileParser implements Parser { static final Logger logger = Logger.getLogger(SingleDictFileParser.class.getName()); // Dictcc public static final String TAB = "\t"; // Chemnitz public static final String DOUBLE_COLON = " :: "; public static final String PIPE = "|"; static final Pattern SPACES = Pattern.compile("\\s+"); static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); // http://www.regular-expressions.info/unicode.html static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+"); public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+"); static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$"); final Charset charset; final String fieldSplit; final DictionaryBuilder dictBuilder; EntrySource entrySource; // final Set alreadyDone = new HashSet(); public SingleDictFileParser(final Charset charset, final String fieldSplit, final DictionaryBuilder dictBuilder) { this.charset = charset; this.fieldSplit = fieldSplit; this.dictBuilder = dictBuilder; } @Override public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { this.entrySource = entrySouce; final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); String line; int count = 0; while ((line = reader.readLine()) != null) { if (pageLimit >= 0 && count >= pageLimit) { return; } if (count % 10000 == 0) { logger.info("count=" + count + ", line=" + line); } parseLine(line); ++count; } } private void parseLine(final String line) { if (line.startsWith("#") || line.isEmpty()) { logger.info("Skipping comment line: " + line); return; } final String[] fields = StringUtil.split(line, fieldSplit); if (fields.length < 2 || fields.length > 4) { logger.warning("Malformed line, expected 3 or 4 fields, got " + fields.length + ": " + line); return; } String headword = SPACES.matcher(fields[0]).replaceAll(" ").trim(); String definition = SPACES.matcher(fields[1]).replaceAll(" ").trim(); final HtmlEntry htmlEntry = new HtmlEntry(entrySource, headword); htmlEntry.html = definition; IndexedEntry entryData = new IndexedEntry(htmlEntry); entryData.isValid = true; final IndexBuilder titleIndexBuilder = dictBuilder.indexBuilders.get(0); final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(headword); tokenData.hasMainEntry = true; htmlEntry.addToDictionary(titleIndexBuilder.index.dict); tokenData.htmlEntries.add(htmlEntry); entryData = null; } public static Set tokenize(final String text, final Pattern pattern) { final String[] split = pattern.split(text); final Set result = new LinkedHashSet<>(Arrays.asList(split)); result.remove(""); return result; } }