final DictionaryBuilder dictionaryBuilder;
public final Index index;
+ // Tokens contained in this set are not indexed by addEntryWithTokens unless the
+ // entry type has overridesStopList set. Supplied by the constructor caller.
+ final Set<String> stoplist;
final SortedMap<String, TokenData> tokenToData;
- IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) {
+ /**
+  * Creates the {@link Index} this builder fills and the sorted token map used while building it.
+  *
+  * @param stoplist tokens that addEntryWithTokens skips unless the entry type overrides the
+  *        stop list; stored as-is (not copied). NOTE(review): a null value would fail later in
+  *        addEntryWithTokens -- confirm callers always pass a set.
+  */
+ IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set<String> stoplist, final boolean swapPairEntries) {
this.dictionaryBuilder = dictionaryBuilder;
index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries);
tokenToData = new TreeMap<String, TokenData>(new NormalizeComparator(index.normalizer(), language.getCollator()));
+ this.stoplist = stoplist;
}
public void build() {
.normalizer().transliterate(tokenData.token), startRow, numRows));
}
- final List<IndexEntry> sortedEntries = new ArrayList<IndexEntry>(index.sortedIndexEntries);
- Collections.sort(sortedEntries, new Comparator<IndexEntry>() {
+ final List<IndexEntry> entriesSortedByRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
+ Collections.sort(entriesSortedByRows, new Comparator<IndexEntry>() {
@Override
public int compare(IndexEntry object1, IndexEntry object2) {
return object2.numRows - object1.numRows;
}});
System.out.println("Most common tokens:");
- for (int i = 0; i < 50 && i < sortedEntries.size(); ++i) {
- System.out.println(" " + sortedEntries.get(i));
+ for (int i = 0; i < 50 && i < entriesSortedByRows.size(); ++i) {
+ System.out.println(" " + entriesSortedByRows.get(i));
}
}
}
}
- public TokenData getOrCreateTokenData(final String token) {
+ private TokenData getOrCreateTokenData(final String token) {
TokenData tokenData = tokenToData.get(token);
if (tokenData == null) {
tokenData = new TokenData(token);
return tokenData;
}
- public List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
+ /**
+  * Returns the entry list stored for {@code token} under {@code entryTypeName}, creating
+  * and registering an empty list in the token's typeToEntries map on first use.
+  *
+  * @param token the token whose TokenData is looked up (or created)
+  * @param entryTypeName the key into that TokenData's typeToEntries map
+  * @return the existing or newly created list; never null
+  */
+ private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
final TokenData tokenData = getOrCreateTokenData(token);
List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
if (entries == null) {
entries = new ArrayList<IndexedEntry>();
tokenData.typeToEntries.put(entryTypeName, entries);
}
return entries;
}
+ /**
+  * Adds {@code indexedEntry} to the entry list of every token in {@code tokens} under
+  * {@code entryTypeName}. A token found in the stoplist is skipped unless
+  * {@code entryTypeName.overridesStopList} is set.
+  */
public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set<String> tokens,
final EntryTypeName entryTypeName) {
for (final String token : tokens) {
+ // Braced explicitly: the guard covers exactly the one add below.
+ if (entryTypeName.overridesStopList || !stoplist.contains(token)) {
getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+ }
}
}
+ /**
+  * Tokenizes {@code untokenizedString} with DictFileParser.tokenize (using
+  * DictFileParser.NON_CHAR) and indexes {@code indexedEntry} under each resulting token.
+  * When the string yields exactly one token, {@code entryTypeName.singleWordInstance} is
+  * used as the entry type; otherwise {@code entryTypeName} itself is used.
+  */
public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
- final EntryTypeName singleTokenEntryTypeName, final EntryTypeName multiTokenEntryTypeName) {
+ final EntryTypeName entryTypeName) {
final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
- addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? singleTokenEntryTypeName : multiTokenEntryTypeName);
+ addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName);
}
- public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
+ /**
+  * Tokenizes {@code untokenizedString} with DictFileParser.tokenize (using
+  * DictFileParser.NON_CHAR) and indexes {@code indexedEntry} under each resulting token,
+  * always with {@code entryTypeName} as given -- the type is not switched to
+  * singleWordInstance when there is only one token.
+  */
+ public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString,
final EntryTypeName entryTypeName) {
- addEntryWithString(indexedEntry, untokenizedString, entryTypeName, entryTypeName);
+ final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
+ addEntryWithTokens(indexedEntry, tokens, entryTypeName);
}
}