import com.hughes.android.dictionary.engine.PairEntry;
import com.hughes.android.dictionary.engine.PairEntry.Pair;
-public class DictFileParser {
+public class DictFileParser implements Parser {
static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
public static final Pattern PIPE = Pattern.compile("\\|");
static final Pattern SPACES = Pattern.compile("\\s+");
-// static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
-// static final Pattern EN_VERB = Pattern.compile("^to ([^ ]+)");
static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
- static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");
- public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
+ // Unicode letter (L), mark (M) and number (N) categories; see http://www.regular-expressions.info/unicode.html
+ static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+");
+ public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+");
- static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$");
+ static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$");
final Charset charset;
final boolean flipCols;
this.bothIndexBuilder = bothIndexBuilder;
}
- public void parseFile(final File file, final EntrySource entrySouce) throws IOException {
+ @Override
+ public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException {
this.entrySource = entrySouce;
final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
String line;
int count = 0;
while ((line = reader.readLine()) != null) {
+ if (pageLimit >= 0 && count >= pageLimit) {
+ return;
+ }
if (count % 10000 == 0) {
logger.info("count=" + count + ", line=" + line);
}
return;
}
final String[] fields = fieldSplit.split(line);
- if (fields.length != 2) {
+ // dictcc now has a part-of-speech field as field #3, so accept either 2 or 3 fields.
+ if (fields.length < 2 || fields.length > 3) {
logger.warning("Malformed line: " + line);
return;
}
subfields[0] = new String[] { fields[0] };
subfields[1] = new String[] { fields[1] };
}
-
+
final PairEntry pairEntry = new PairEntry(entrySource);
for (int i = 0; i < subfields[0].length; ++i) {
subfields[0][i] = subfields[0][i].trim();
pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
}
final IndexedEntry entryData = new IndexedEntry(pairEntry);
+ entryData.isValid = true;
for (int l = 0; l < 2; ++l) {
// alreadyDone.clear();