return;
}
final String[] fields = fieldSplit.split(line);
- // dictcc now has a part of speech field as field #3.
- if (fields.length < 2 || fields.length > 3) {
- logger.warning("Malformed line: " + line);
+ if (fields.length < 2 || fields.length > 4) {
+ logger.warning("Malformed line, expected 3 or 4 fields, got " + fields.length + ": " + line);
return;
}
return field;
}
- public static final Set<String> tokenize(final String text, final Pattern pattern) {
+ public static Set<String> tokenize(final String text, final Pattern pattern) {
final String[] split = pattern.split(text);
- final Set<String> result = new LinkedHashSet<String>(Arrays.asList(split));
+ final Set<String> result = new LinkedHashSet<>(Arrays.asList(split));
result.remove("");
return result;
}