import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
+import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.EntrySource;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
final IndexBuilder[] langIndexBuilders;
final IndexBuilder bothIndexBuilder;
+ EntrySource entrySource;
+
// final Set<String> alreadyDone = new HashSet<String>();
public DictFileParser(final Charset charset, boolean flipCols,
this.bothIndexBuilder = bothIndexBuilder;
}
- public void parseFile(final File file) throws IOException {
+ public void parseFile(final File file, final EntrySource entrySouce) throws IOException {
+ this.entrySource = entrySouce;
final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
String line;
int count = 0;
subfields[1] = new String[] { fields[1] };
}
- final PairEntry pairEntry = new PairEntry();
+ final PairEntry pairEntry = new PairEntry(entrySource);
for (int i = 0; i < subfields[0].length; ++i) {
subfields[0][i] = subfields[0][i].trim();
subfields[1][i] = subfields[1][i].trim();
+ if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { // both sides blank after trim: log it and add no Pair for this index
+ logger.warning("Empty pair: " + line);
+ continue;
+ }
+ if (subfields[0][i].length() == 0) { // only the left side is blank here: substitute the literal "__"
+ subfields[0][i] = "__";
+ }
+ if (subfields[1][i].length() == 0) { // only the right side is blank here: substitute the literal "__"
+ subfields[1][i] = "__";
+ }
pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
}
final IndexedEntry entryData = new IndexedEntry(pairEntry);
for (String token : tokens) {
token = TRIM_PUNC.matcher(token).replaceAll("");
if (/*!alreadyDone.contains(token) && */token.length() > 0) {
- final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, entryTypeName);
- entries.add(entryData);
+ indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName);
// alreadyDone.add(token);
// also split words on dashes, do them, too.
final String[] dashed = token.split("-");
for (final String dashedToken : dashed) {
if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) {
- final List<IndexedEntry> dashEntries = indexBuilder.getOrCreateEntries(dashedToken, EntryTypeName.PART_OF_HYPHENATED);
- dashEntries.add(entryData);
+ indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED);
}
}
}
for (final String token : bracketedTokens) {
assert !token.contains("-");
if (/*!alreadyDone.contains(token) && */token.length() > 0) {
- final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.BRACKETED);
- entries.add(entryData);
+ indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); // passes entryData with a one-element token set, typed BRACKETED
}
}
for (final String token : parenTokens) {
assert !token.contains("-");
if (/*!alreadyDone.contains(token) && */token.length() > 0) {
- final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, EntryTypeName.PARENTHESIZED);
- entries.add(entryData);
+ indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); // passes entryData with a one-element token set, typed PARENTHESIZED
}
}