-
- // Logger named after this class.
- static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
-
- // Dictcc
- // Matches a single tab character.
- // NOTE(review): presumably the field separator for dict.cc files -- confirm against callers.
- public static final Pattern TAB = Pattern.compile("\\t");
-
- // Chemnitz
- // DOUBLE_COLON matches the literal " :: " (including the surrounding spaces);
- // PIPE matches a literal '|'.
- // NOTE(review): presumably the field / subfield separators for Chemnitz files -- confirm against callers.
- public static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
- public static final Pattern PIPE = Pattern.compile("\\|");
-
- // One or more whitespace characters; parseLine uses it to collapse whitespace runs to a single space.
- static final Pattern SPACES = Pattern.compile("\\s+");
-
- // Non-empty text enclosed in [], () or {} respectively; group 1 captures the text
- // between the delimiters (which may not itself contain the closing delimiter).
- static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]");
- static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
- static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
-
- // http://www.regular-expressions.info/unicode.html
- // \p{L} = letters, \p{M} = combining marks, \p{N} = numbers.
- // NON_CHAR_DASH matches a run of characters that are none of: '-', '\'', letter, mark, number.
- // NON_CHAR is the same but also treats '-' and '\'' as non-characters.
- static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+");
- public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+");
-
- // Matches a run of non-letter/mark/number characters anchored at the start or at the end of the input.
- static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$");
-
- // Character encoding used by parse() to decode the input file.
- final Charset charset;
- // When true, parseLine swaps the two fields of every line after splitting.
- final boolean flipCols;
-
- // Splits an input line into its fields; parseLine rejects lines that do not yield exactly two.
- final Pattern fieldSplit;
- // Not used in the portion of the class visible here.
- // NOTE(review): presumably splits one field into alternatives -- confirm.
- final Pattern subfieldSplit;
-
- // Builders handed in through the constructor; how they are fed is not visible in this part of the file.
- // NOTE(review): langIndexBuilders presumably holds one builder per language column -- confirm.
- final DictionaryBuilder dictBuilder;
- final IndexBuilder[] langIndexBuilders;
- final IndexBuilder bothIndexBuilder;
-
- // Source of the entries currently being parsed; (re)assigned at the start of every parse() call.
- EntrySource entrySource;
-
- // final Set<String> alreadyDone = new HashSet<String>();
-
- /**
-  * Creates a parser for two-column dictionary text files.
-  *
-  * All arguments are stored as given: nothing is validated and the
-  * {@code langIndexBuilders} array is kept by reference, not copied.
-  *
-  * @param charset           encoding used to decode the input file
-  * @param flipCols          whether the two columns of each line are swapped after splitting
-  * @param fieldSplit        pattern that splits a line into its two fields
-  * @param subfieldSplit     pattern stored for splitting within a field
-  * @param dictBuilder       dictionary builder to store
-  * @param langIndexBuilders index builders to store
-  * @param bothIndexBuilder  combined index builder to store
-  */
- public DictFileParser(
-         final Charset charset,
-         boolean flipCols,
-         final Pattern fieldSplit,
-         final Pattern subfieldSplit,
-         final DictionaryBuilder dictBuilder,
-         final IndexBuilder[] langIndexBuilders,
-         final IndexBuilder bothIndexBuilder) {
-     // Output sinks.
-     this.bothIndexBuilder = bothIndexBuilder;
-     this.langIndexBuilders = langIndexBuilders;
-     this.dictBuilder = dictBuilder;
-
-     // Line-splitting rules.
-     this.subfieldSplit = subfieldSplit;
-     this.fieldSplit = fieldSplit;
-
-     // Input decoding and column order.
-     this.flipCols = flipCols;
-     this.charset = charset;
- }
-
- /**
-  * Reads {@code file} line by line, decoding it with this parser's charset,
-  * and hands every line to {@code parseLine}.
-  *
-  * Progress is logged on every 10000th line. When {@code pageLimit} is
-  * non-negative, parsing stops once that many lines have been processed
-  * (the check happens after the next line has already been read).
-  *
-  * The underlying file stream is always closed before this method returns,
-  * whether it finishes normally, stops at the page limit, or throws.
-  *
-  * @param file       the dictionary file to read
-  * @param entrySouce entry source remembered in {@code entrySource} for the duration of the parse
-  * @param pageLimit  maximum number of lines to process, or a negative value for no limit
-  * @throws IOException if the file cannot be opened, read, or closed
-  */
- @Override
- public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException {
-     this.entrySource = entrySouce;
-     // Hold the raw stream separately so it is released even if building the
-     // reader fails; closing it frees the file handle the reader wraps.
-     final FileInputStream in = new FileInputStream(file);
-     try {
-         final BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
-         String line;
-         int count = 0;
-         while ((line = reader.readLine()) != null) {
-             if (pageLimit >= 0 && count >= pageLimit) {
-                 return;
-             }
-             if (count % 10000 == 0) {
-                 logger.info("count=" + count + ", line=" + line);
-             }
-             parseLine(line);
-             ++count;
-         }
-     } finally {
-         in.close();
-     }
- }
-
- private void parseLine(final String line) {
- if (line.startsWith("#") || line.length() == 0) {
- logger.info("Skipping comment line: " + line);
- return;
- }
- final String[] fields = fieldSplit.split(line);
- if (fields.length != 2) {
- logger.warning("Malformed line: " + line);
- return;
- }
-
- fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
- fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
- if (flipCols) {
- final String temp = fields[0];
- fields[0] = fields[1];
- fields[1] = temp;
- }