1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.io.BufferedReader;
18 import java.io.BufferedInputStream;
19 import java.io.DataInputStream;
21 import java.io.FileInputStream;
22 import java.io.FileWriter;
23 import java.io.IOException;
24 import java.io.InputStreamReader;
25 import java.nio.charset.Charset;
26 import java.util.Arrays;
27 import java.util.Collections;
28 import java.util.LinkedHashSet;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
34 import com.hughes.android.dictionary.engine.AbstractEntry;
35 import com.hughes.android.dictionary.engine.DictionaryBuilder;
36 import com.hughes.android.dictionary.engine.EntrySource;
37 import com.hughes.android.dictionary.engine.EntryTypeName;
38 import com.hughes.android.dictionary.engine.HtmlEntry;
39 import com.hughes.android.dictionary.engine.IndexBuilder;
40 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
41 import com.hughes.android.dictionary.engine.IndexedEntry;
42 import com.hughes.android.dictionary.engine.Language;
43 import com.hughes.android.dictionary.engine.PairEntry;
44 import com.hughes.util.StringUtil;
// Parser for delimiter-separated dictionary text files. Each input line is split
// into fields (and optionally subfields) and the resulting entries are handed to
// the index builders of the DictionaryBuilder supplied at construction.
46 public class DictFileParser implements Parser {
48 static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
// Separator strings; presumably passed by callers as fieldSplit / subfieldSplit
// depending on the input format -- confirm against call sites.
51 public static final String TAB = "\t";
54 public static final String DOUBLE_COLON = " :: ";
55 public static final String PIPE = "|";
// One or more whitespace characters.
57 static final Pattern SPACES = Pattern.compile("\\s+");
// A "{...}" span containing at least one character other than '}'.
59 static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
61 // http://www.regular-expressions.info/unicode.html
// Runs of characters that are neither Unicode letters, marks, numbers,
// hyphens nor apostrophes.
62 static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+");
// Runs of characters that are not Unicode letters, marks or numbers
// (hyphens and apostrophes are matched here, unlike NON_CHAR_DASH).
63 public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+");
// A leading or trailing run of characters that are not letters, marks or numbers.
65 static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$");
// Character set supplied by the caller at construction.
67 final Charset charset;
// Column-swap flag supplied at construction.
68 final boolean flipCols;
// Separator used to split a line into fields.
70 final String fieldSplit;
// Separator used to split a field into subfields; null means no subfield splitting.
71 final String subfieldSplit;
// Single-language flag supplied at construction.
73 final boolean singleLang;
// Destination whose indexBuilders receive the parsed entries.
75 final DictionaryBuilder dictBuilder;
// Source attributed to every entry created; assigned by parse().
77 EntrySource entrySource;
79 // final Set<String> alreadyDone = new HashSet<String>();
/**
 * Creates a parser that feeds entries into the given dictionary builder.
 *
 * @param charset       character set stored for reading the input file
 * @param flipCols      column-swap flag; presumably exchanges the first two
 *                      fields of every line (confirm against parseLine)
 * @param fieldSplit    separator used to split a line into fields
 * @param subfieldSplit separator used to split a field into subfields, or
 *                      {@code null} to treat each field as a single subfield
 * @param singleLang    single-language flag; presumably selects HTML entries
 *                      instead of pair entries (confirm against parseLine)
 * @param dictBuilder   builder whose index builders receive the parsed entries
 */
public DictFileParser(final Charset charset, boolean flipCols,
        final String fieldSplit, final String subfieldSplit,
        final boolean singleLang,
        final DictionaryBuilder dictBuilder) {
    // Where the entries go.
    this.dictBuilder = dictBuilder;

    // How the raw text is decoded and cut apart.
    this.charset = charset;
    this.fieldSplit = fieldSplit;
    this.subfieldSplit = subfieldSplit;

    // How the resulting columns are interpreted.
    this.flipCols = flipCols;
    this.singleLang = singleLang;
}
94 public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException {
95 this.entrySource = entrySouce;
96 final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
99 while ((line = reader.readLine()) != null) {
100 if (pageLimit >= 0 && count >= pageLimit) {
103 if (count % 10000 == 0) {
104 logger.info("count=" + count + ", line=" + line);
// Parses one input line: splits it into fields and subfields, builds an entry
// from them, and registers that entry's words with the index builders.
// NOTE(review): several short lines of this method (returns/continues, closing
// braces, some if/else headers) are not visible in this listing; the comments
// below describe only the visible statements -- confirm the exact control flow
// against the complete source.
111 private void parseLine(final String line) {
// Lines starting with '#' and empty lines are logged as skipped.
112 if (line.startsWith("#") || line.isEmpty()) {
113 logger.info("Skipping comment line: " + line);
116 final String[] fields = StringUtil.split(line, fieldSplit);
// NOTE(review): the condition accepts 2, 3 or 4 fields, while the message says
// "expected 3 or 4" -- confirm which is intended.
117 if (fields.length < 2 || fields.length > 4) {
118 logger.warning("Malformed line, expected 3 or 4 fields, got " + fields.length + ": " + line);
// Collapse every whitespace run in the first two fields to a single space and trim.
122 fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim();
123 fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim();
// Beginning of a swap of the two columns; presumably guarded by flipCols -- confirm.
125 final String temp = fields[0];
126 fields[0] = fields[1];
// subfields[c][i] is the i-th subfield of column c.
130 final String[][] subfields = new String[2][];
131 if (subfieldSplit != null) {
132 subfields[0] = StringUtil.split(fields[0], subfieldSplit);
133 subfields[1] = StringUtil.split(fields[1], subfieldSplit);
// Both columns must yield the same number of subfields so they can be paired by index.
134 if (subfields[0].length != subfields[1].length) {
135 logger.warning("Number of subfields doesn't match: " + line);
// Without subfield splitting each column is a single subfield.
139 subfields[0] = new String[] { fields[0] };
140 subfields[1] = new String[] { fields[1] };
// Trim every subfield pair; a pair with both sides empty is logged, and a single
// empty side is replaced by the placeholder "__".
143 for (int i = 0; i < subfields[0].length; ++i) {
144 subfields[0][i] = subfields[0][i].trim();
145 subfields[1][i] = subfields[1][i].trim();
146 if (subfields[0][i].isEmpty() && subfields[1][i].isEmpty()) {
147 logger.warning("Empty pair: " + line);
150 if (subfields[0][i].isEmpty()) {
151 subfields[0][i] = "__";
153 if (subfields[1][i].isEmpty()) {
154 subfields[1][i] = "__";
// HTML entry: the first field is the title, the second field (escaped to pure
// HTML) is the body. Presumably only used when singleLang is set -- confirm.
159 HtmlEntry htmlEntry = new HtmlEntry(entrySource, fields[0]);
160 htmlEntry.html = StringUtil.escapeUnicodeToPureHtml(fields[1]);
// The entry is stored in the dictionary of the first index builder.
162 final IndexBuilder titleIndexBuilder = dictBuilder.indexBuilders.get(0);
163 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
// The full title is flagged as having a main entry, and this entry is put
// first in the title token's list.
165 TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(fields[0]);
166 tokenData.hasMainEntry = true;
167 tokenData.htmlEntries.add(0, htmlEntry);
// When the title splits into more than one piece, every non-empty piece also
// gets this entry appended to its list.
169 final String[] tokens = NON_CHAR.split(fields[0]);
170 if (tokens.length > 1) {
171 for (final String token : tokens) {
172 assert token.length() >= 1;
173 assert token.indexOf("-") == -1;
174 if (/*!alreadyDone.contains(token) && */!token.isEmpty()) {
175 tokenData = titleIndexBuilder.getOrCreateTokenData(token);
176 tokenData.htmlEntries.add(htmlEntry);
180 final IndexedEntry entryData = new IndexedEntry(htmlEntry);
181 entryData.isValid = true;
// Pair entry: one Pair per subfield index, combining both columns.
184 PairEntry pairEntry = new PairEntry(entrySource);
185 for (int i = 0; i < subfields[0].length; ++i) {
186 pairEntry.pairs.add(new PairEntry.Pair(subfields[0][i], subfields[1][i]));
188 final IndexedEntry entryData = new IndexedEntry(pairEntry);
189 entryData.isValid = true;
// Column l is indexed with index builder l.
190 for (int l = 0; l < 2; ++l) {
191 // alreadyDone.clear();
193 final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(l);
194 for (int j = 0; j < subfields[l].length; ++j) {
195 String subfield = subfields[l][j];
// German and English indexes preprocess the subfield before generic tokenizing.
196 if (indexBuilder.index.sortLanguage == Language.de) {
197 subfield = parseField_DE(indexBuilder, subfield, entryData, j);
198 } else if (indexBuilder.index.sortLanguage == Language.en) {
199 subfield = parseField_EN(indexBuilder, subfield, entryData, j);
201 parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length);
208 private StringBuilder extractParenthesized(StringBuilder in, String startChar, String endChar) {
209 StringBuilder res = new StringBuilder();
211 while ((pos = in.indexOf(startChar, pos)) != -1) {
212 int end = in.indexOf(endChar, pos + 1);
213 if (end == -1) break;
214 res.append(in, pos + 1, end).append(" ");
215 in.replace(pos, end + 1, " ");
216 pos++; // skip the just appended space
221 private void parseFieldGeneric(final IndexBuilder indexBuilder, String field,
222 final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) {
223 final StringBuilder fieldsb = new StringBuilder(field);
224 // remove bracketed and parenthesized stuff.
225 final StringBuilder bracketed = extractParenthesized(fieldsb, "[", "]");
226 final StringBuilder parenthesized = extractParenthesized(fieldsb, "(", ")");
228 field = fieldsb.toString().trim();
230 // split words on non -A-z0-9, do them.
231 final String[] tokens = NON_CHAR_DASH.split(field);
233 final EntryTypeName entryTypeName;
234 if (numSubFields == 1) {
235 assert subfieldIdx == 0;
236 if (tokens.length == 1) {
237 entryTypeName = EntryTypeName.ONE_WORD;
238 } else if (tokens.length == 2) {
239 entryTypeName = EntryTypeName.TWO_WORDS;
240 } else if (tokens.length == 3) {
241 entryTypeName = EntryTypeName.THREE_WORDS;
242 } else if (tokens.length == 4) {
243 entryTypeName = EntryTypeName.FOUR_WORDS;
245 entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS;
248 assert numSubFields > 1;
249 if (subfieldIdx == 0) {
250 if (tokens.length == 1) {
251 entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD;
253 entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS;
256 assert subfieldIdx > 0;
257 if (tokens.length == 1) {
258 entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD;
260 entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS;
265 for (String token : tokens) {
266 token = TRIM_PUNC.matcher(token).replaceAll("");
267 if (/*!alreadyDone.contains(token) && */!token.isEmpty()) {
268 indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName);
269 // alreadyDone.add(token);
271 // also split words on dashes, do them, too.
272 if (token.indexOf('-') != -1) {
273 final String[] dashed = StringUtil.split(token, "-");
274 for (final String dashedToken : dashed) {
275 if (/*!alreadyDone.contains(dashedToken) && */!dashedToken.isEmpty()) {
276 indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED);
281 } // if (!alreadyDone.contains(token)) {
282 } // for (final String token : tokens) {
284 // process bracketed stuff (split on spaces and dashes always)
285 if (bracketed.length() > 0) {
286 final String[] bracketedTokens = NON_CHAR.split(bracketed.toString());
287 for (final String token : bracketedTokens) {
288 assert token.indexOf("-") == -1;
289 if (/*!alreadyDone.contains(token) && */!token.isEmpty()) {
290 indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED);
295 // process paren stuff
296 if (parenthesized.length() > 0) {
297 final String[] parenTokens = NON_CHAR.split(parenthesized.toString());
298 for (final String token : parenTokens) {
299 assert token.indexOf("-") == -1;
300 if (/*!alreadyDone.contains(token) && */!token.isEmpty()) {
301 indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED);
308 private String parseField_DE(final IndexBuilder indexBuilder, String field,
309 final IndexedEntry entryData, final int subfieldIdx) {
311 // final Matcher matcher = DE_NOUN.matcher(field);
312 // while (matcher.find()) {
313 // final String noun = matcher.group(1);
314 //final String gender = matcher.group(2);
315 // if (alreadyDone.add(noun)) {
316 // System.out.println("Found DE noun " + noun + ", " + gender);
317 // final List<EntryData> entries = indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN);
318 // entries.add(entryData);
322 if (field.indexOf('{') == -1) return field;
324 // In English, curly braces are used for different tenses.
325 field = CURLY_BRACED.matcher(field).replaceAll(" ");
330 private String parseField_EN(final IndexBuilder indexBuilder, String field,
331 final IndexedEntry entryData, final int subfieldIdx) {
332 if (field.startsWith("to ")) {
333 field = field.substring(3);
338 public static Set<String> tokenize(final String text, final Pattern pattern) {
339 final String[] split = pattern.split(text);
340 final Set<String> result = new LinkedHashSet<>(Arrays.asList(split));