X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FIndexBuilder.java;h=0c3fa13da2147e30c0bd9e9f0ab3c0ed2b925bb0;hb=26ab537cbfd3e303f636d793fd55ea950dc8f5b2;hp=59d44c5c43a4e3b3ae231f67c3063962149440f3;hpb=796056edb719a04daf100ddbdbc7b845bce1eaba;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 59d44c5..0c3fa13 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -1,6 +1,22 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.engine; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.EnumMap; import java.util.HashSet; import java.util.List; @@ -9,51 +25,101 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; -import com.hughes.android.dictionary.Language; +import com.hughes.android.dictionary.engine.Index.IndexEntry; +import com.hughes.android.dictionary.parser.DictFileParser; public class IndexBuilder { final DictionaryBuilder dictionaryBuilder; - final Index index; + public final Index index; + final Set stoplist; final SortedMap tokenToData; - @SuppressWarnings("unchecked") - IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language) { + IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { this.dictionaryBuilder = dictionaryBuilder; - index = new Index(dictionaryBuilder.dictionary, shortName, longName, language); - tokenToData = new TreeMap(language.getSortCollator()); + index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); + tokenToData = new TreeMap(index.getSortComparator()); + this.stoplist = stoplist; } public void build() { - final Set tokenEntryDatas = new HashSet(); + final Set tokenIndexedEntries = new HashSet(); final List rows = index.rows; + index.mainTokenCount = 0; for (final TokenData tokenData : tokenToData.values()) { - tokenEntryDatas.clear(); - final int indexRow = index.sortedIndexEntries.size(); - index.sortedIndexEntries.add(new Index.IndexEntry(tokenData.token, rows.size())); - rows.add(new TokenRow(indexRow, rows.size(), index)); - int count = 0; - System.out.println("TOKEN: " + tokenData.token); - for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { - for (final EntryData entryData : typeToEntry.getValue()) { - if (tokenEntryDatas.add(entryData)) { - rows.add(new PairEntry.Row(entryData.index(), rows.size(), index)); - ++count; + tokenIndexedEntries.clear(); + final int indexIndex = index.sortedIndexEntries.size(); + final int startRow = rows.size(); + + TokenRow tokenRow = null; + if (!tokenData.htmlEntries.isEmpty()) { + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); + } + +// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); + + int numRows = 0; // off by one--doesn't count the token row! +// System.out.println("TOKEN: " + tokenData.token); + for (final Map.Entry> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) { + for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) { + if (!indexedEntry.isValid) { + continue; + } + + if (tokenRow == null) { + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); + } + + if (indexedEntry.entry.index() == -1) { + indexedEntry.entry.addToDictionary(dictionaryBuilder.dictionary); + assert indexedEntry.entry.index() >= 0; + } + if (tokenIndexedEntries.add(indexedEntry) && !tokenData.htmlEntries.contains(indexedEntry.entry)) { + rows.add(indexedEntry.entry.CreateRow(rows.size(), index)); + ++indexedEntry.entry.entrySource.numEntries; + ++numRows; - System.out.print(" " + typeToEntry.getKey() + ": "); - rows.get(rows.size() - 1).print(System.out); - System.out.println(); +// System.out.print(" " + typeToEntry.getKey() + ": "); + // rows.get(rows.size() - 1).print(System.out); +// System.out.println(); } } } + + if (tokenRow != null) { + if (tokenRow.hasMainEntry) { + index.mainTokenCount++; + } + + final Index.IndexEntry indexEntry = new Index.IndexEntry(index, tokenData.token, index + .normalizer().transliterate(tokenData.token), startRow, numRows); + indexEntry.htmlEntries.addAll(tokenData.htmlEntries); + index.sortedIndexEntries.add(indexEntry); + } + } + + final List entriesSortedByNumRows = new ArrayList(index.sortedIndexEntries); + Collections.sort(entriesSortedByNumRows, new Comparator() { + @Override + public int compare(IndexEntry object1, IndexEntry object2) { + return object2.numRows - object1.numRows; + }}); + System.out.println("Most common tokens:"); + for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { + System.out.println(" " + entriesSortedByNumRows.get(i)); } } - static class TokenData { + public static class TokenData { final String token; - final Map> typeToEntries = new EnumMap>(EntryTypeName.class); + final Map> typeToEntries = new EnumMap>(EntryTypeName.class); + public boolean hasMainEntry = false; + + public List htmlEntries = new ArrayList(); TokenData(final String token) { assert token.equals(token.trim()); @@ -71,15 +137,41 @@ public class IndexBuilder { return tokenData; } - public List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { + private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { final TokenData tokenData = getOrCreateTokenData(token); - List entries = tokenData.typeToEntries.get(entryTypeName); + List entries = tokenData.typeToEntries.get(entryTypeName); + if (entryTypeName.mainWord) { + tokenData.hasMainEntry = true; + } if (entries == null) { - entries = new ArrayList(); + entries = new ArrayList(); tokenData.typeToEntries.put(entryTypeName, entries); } return entries; } - + public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, + final EntryTypeName entryTypeName) { + if (indexedEntry == null) { + System.out.println("asdfasdf"); + } + assert indexedEntry != null; + for (final String token : tokens) { + if (entryTypeName.overridesStopList || !stoplist.contains(token)) { + getOrCreateEntries(token, entryTypeName).add(indexedEntry); + } + } + } + + public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); + } + + public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, entryTypeName); + } }