X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FIndexBuilder.java;h=680cf4adc5dee305793dced0a21c5b4be27ed0a8;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=a8d225a6505f8d6aa519e053baae3789b9449c8f;hpb=d46f529d02bf4306a922c521d032f7620020b1e8;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index a8d225a..680cf4a 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -29,137 +29,145 @@ import com.hughes.android.dictionary.engine.Index.IndexEntry; import com.hughes.android.dictionary.parser.DictFileParser; public class IndexBuilder { - - final DictionaryBuilder dictionaryBuilder; - public final Index index; - final Set stoplist; - - final SortedMap tokenToData; - - IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { - this.dictionaryBuilder = dictionaryBuilder; - index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); - tokenToData = new TreeMap(index.getSortComparator()); - this.stoplist = stoplist; - } - - public void build() { - final Set tokenIndexedEntries = new HashSet(); - final List rows = index.rows; - index.mainTokenCount = 0; - for (final TokenData tokenData : tokenToData.values()) { - tokenIndexedEntries.clear(); - final int indexIndex = index.sortedIndexEntries.size(); - final int startRow = rows.size(); - - TokenRow tokenRow = null; - - int numRows = 0; // off by one--doesn't count the token row! -// System.out.println("TOKEN: " + tokenData.token); - for (final Map.Entry> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) { - for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) { - - if (!indexedEntry.isValid) { - continue; - } - - if (tokenRow == null) { -// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); - tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); - rows.add(tokenRow); - if (tokenRow.hasMainEntry) { - index.mainTokenCount++; + + final DictionaryBuilder dictionaryBuilder; + public final Index index; + final Set stoplist; + + final SortedMap tokenToData; + + IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { + this.dictionaryBuilder = dictionaryBuilder; + index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); + tokenToData = new TreeMap<>(index.getSortComparator()); + this.stoplist = stoplist; + } + + public void build() { + final Set tokenIndexedEntries = new HashSet<>(); + final List rows = index.rows; + index.mainTokenCount = 0; + for (final TokenData tokenData : tokenToData.values()) { + tokenIndexedEntries.clear(); + final int indexIndex = index.sortedIndexEntries.size(); + final int startRow = rows.size(); + + TokenRow tokenRow = null; + if (!tokenData.htmlEntries.isEmpty()) { + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); } - } - - if (indexedEntry.index() == -1) { - indexedEntry.addToDictionary(dictionaryBuilder.dictionary); - assert indexedEntry.index() >= 0; - } - if (tokenIndexedEntries.add(indexedEntry)) { - rows.add(indexedEntry.entry.CreateRow(indexedEntry.index(), rows.size(), index)); - ++indexedEntry.entry.entrySource.numEntries; - ++numRows; - + +// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); + + int numRows = 0; // off by one--doesn't count the token row! +// System.out.println("TOKEN: " + tokenData.token); + for (final Map.Entry> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) { + for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) { + if (!indexedEntry.isValid) { + continue; + } + + if (tokenRow == null) { + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); + } + + if (indexedEntry.entry.index() == -1) { + indexedEntry.entry.addToDictionary(dictionaryBuilder.dictionary); + assert indexedEntry.entry.index() >= 0; + } + if (tokenIndexedEntries.add(indexedEntry) && !tokenData.htmlEntries.contains(indexedEntry.entry)) { + rows.add(indexedEntry.entry.CreateRow(rows.size(), index)); + ++indexedEntry.entry.entrySource.numEntries; + ++numRows; + // System.out.print(" " + typeToEntry.getKey() + ": "); - // rows.get(rows.size() - 1).print(System.out); + // rows.get(rows.size() - 1).print(System.out); // System.out.println(); - } + } + } + } + + if (tokenRow != null) { + if (tokenRow.hasMainEntry) { + index.mainTokenCount++; + } + + final Index.IndexEntry indexEntry = new Index.IndexEntry(index, tokenData.token, index + .normalizer().transliterate(tokenData.token), startRow, numRows); + indexEntry.htmlEntries.addAll(tokenData.htmlEntries); + index.sortedIndexEntries.add(indexEntry); + } + } + + final List entriesSortedByNumRows = new ArrayList<>(index.sortedIndexEntries); + entriesSortedByNumRows.sort((object1, object2) -> object2.numRows - object1.numRows); + System.out.println("Most common tokens:"); + for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { + System.out.println(" " + entriesSortedByNumRows.get(i)); } - } - index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, index - .normalizer().transliterate(tokenData.token), startRow, numRows)); } - - final List entriesSortedByNumRows = new ArrayList(index.sortedIndexEntries); - Collections.sort(entriesSortedByNumRows, new Comparator() { - @Override - public int compare(IndexEntry object1, IndexEntry object2) { - return object2.numRows - object1.numRows; - }}); - System.out.println("Most common tokens:"); - for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { - System.out.println(" " + entriesSortedByNumRows.get(i)); + + public static class TokenData { + final String token; + + final Map> typeToEntries = new EnumMap<>(EntryTypeName.class); + public boolean hasMainEntry = false; + + public final List htmlEntries = new ArrayList<>(); + + TokenData(final String token) { + assert token.equals(token.trim()); + assert token.length() > 0; + this.token = token; + } } - } - - static class TokenData { - final String token; - - final Map> typeToEntries = new EnumMap>(EntryTypeName.class); - boolean hasMainEntry = false; - - TokenData(final String token) { - assert token.equals(token.trim()); - assert token.length() > 0; - this.token = token; + + public TokenData getOrCreateTokenData(final String token) { + TokenData tokenData = tokenToData.get(token); + if (tokenData == null) { + tokenData = new TokenData(token); + tokenToData.put(token, tokenData); + } + return tokenData; } - } - private TokenData getOrCreateTokenData(final String token) { - TokenData tokenData = tokenToData.get(token); - if (tokenData == null) { - tokenData = new TokenData(token); - tokenToData.put(token, tokenData); + private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { + final TokenData tokenData = getOrCreateTokenData(token); + List entries = tokenData.typeToEntries.get(entryTypeName); + if (entryTypeName.mainWord) { + tokenData.hasMainEntry = true; + } + if (entries == null) { + entries = new ArrayList<>(); + tokenData.typeToEntries.put(entryTypeName, entries); + } + return entries; } - return tokenData; - } - - private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { - final TokenData tokenData = getOrCreateTokenData(token); - List entries = tokenData.typeToEntries.get(entryTypeName); - if (entryTypeName.mainWord) { - tokenData.hasMainEntry = true; + + public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, + final EntryTypeName entryTypeName) { + if (indexedEntry == null) { + System.out.println("asdfasdf"); + } + assert indexedEntry != null; + for (final String token : tokens) { + if (entryTypeName.overridesStopList || !stoplist.contains(token)) { + getOrCreateEntries(token, entryTypeName).add(indexedEntry); + } + } } - if (entries == null) { - entries = new ArrayList(); - tokenData.typeToEntries.put(entryTypeName, entries); + + public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); } - return entries; - } - public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, - final EntryTypeName entryTypeName) { - if (indexedEntry == null) { - System.out.println("asdfasdf"); + public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, entryTypeName); } - assert indexedEntry != null; - for (final String token : tokens) { - if (entryTypeName.overridesStopList || !stoplist.contains(token)) { - getOrCreateEntries(token, entryTypeName).add(indexedEntry); - } - } - } - - public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, - final EntryTypeName entryTypeName) { - final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); - addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); - } - - public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, - final EntryTypeName entryTypeName) { - final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); - addEntryWithTokens(indexedEntry, tokens, entryTypeName); - } }