X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FIndexBuilder.java;h=32a087f47390d48e1c9ae23d928c07c4449fe735;hb=d06b99b469b18cfa4a8a4bd45d51ee4ebd7efaca;hp=4f64fa25258dd835709707aa27cb20e960b3f64d;hpb=9a0850fd39f27b5cf08dbf63510f523d8bceff5d;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 4f64fa2..32a087f 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -1,3 +1,17 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.engine; import java.util.ArrayList; @@ -12,23 +26,25 @@ import java.util.SortedMap; import java.util.TreeMap; import com.hughes.android.dictionary.engine.Index.IndexEntry; - +import com.hughes.android.dictionary.parser.DictFileParser; public class IndexBuilder { final DictionaryBuilder dictionaryBuilder; public final Index index; + final Set stoplist; final SortedMap tokenToData; - IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) { + IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { this.dictionaryBuilder = dictionaryBuilder; index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries); tokenToData = new TreeMap(new NormalizeComparator(index.normalizer(), language.getCollator())); + this.stoplist = stoplist; } public void build() { - final Set tokenEntryDatas = new HashSet(); + final Set tokenEntryDatas = new HashSet(); final List rows = index.rows; for (final TokenData tokenData : tokenToData.values()) { tokenEntryDatas.clear(); @@ -38,8 +54,12 @@ public class IndexBuilder { // System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); int numRows = 0; // System.out.println("TOKEN: " + tokenData.token); - for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { - for (final EntryData entryData : typeToEntry.getValue()) { + for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { + for (final IndexedEntry entryData : typeToEntry.getValue()) { + if (entryData.index() == -1) { + entryData.addToDictionary(dictionaryBuilder.dictionary); + assert entryData.index() >= 0; + } if (tokenEntryDatas.add(entryData)) { rows.add(new PairEntry.Row(entryData.index(), rows.size(), index)); ++numRows; @@ -54,22 +74,22 @@ public class IndexBuilder { .normalizer().transliterate(tokenData.token), startRow, numRows)); } - final List sortedEntries = new ArrayList(index.sortedIndexEntries); - Collections.sort(sortedEntries, new Comparator() { + final List entriesSortedByRows = new ArrayList(index.sortedIndexEntries); + Collections.sort(entriesSortedByRows, new Comparator() { @Override public int compare(IndexEntry object1, IndexEntry object2) { return object2.numRows - object1.numRows; }}); System.out.println("Most common tokens:"); - for (int i = 0; i < 50 && i < sortedEntries.size(); ++i) { - System.out.println(" " + sortedEntries.get(i)); + for (int i = 0; i < 50 && i < entriesSortedByRows.size(); ++i) { + System.out.println(" " + entriesSortedByRows.get(i)); } } static class TokenData { final String token; - final Map> typeToEntries = new EnumMap>(EntryTypeName.class); + final Map> typeToEntries = new EnumMap>(EntryTypeName.class); TokenData(final String token) { assert token.equals(token.trim()); @@ -78,7 +98,7 @@ public class IndexBuilder { } } - public TokenData getOrCreateTokenData(final String token) { + private TokenData getOrCreateTokenData(final String token) { TokenData tokenData = tokenToData.get(token); if (tokenData == null) { tokenData = new TokenData(token); @@ -87,22 +107,37 @@ public class IndexBuilder { return tokenData; } - public List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { + private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { final TokenData tokenData = getOrCreateTokenData(token); - List entries = tokenData.typeToEntries.get(entryTypeName); + List entries = tokenData.typeToEntries.get(entryTypeName); if (entries == null) { - entries = new ArrayList(); + entries = new ArrayList(); tokenData.typeToEntries.put(entryTypeName, entries); } return entries; } - public void addEntryWithTokens(final EntryData entryData, final Set tokens, + public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, final EntryTypeName entryTypeName) { + if (indexedEntry == null) { + System.out.println("asdfasdf"); + } + assert indexedEntry != null; for (final String token : tokens) { - getOrCreateEntries(token, entryTypeName).add(entryData); + if (entryTypeName.overridesStopList || !stoplist.contains(token)) + getOrCreateEntries(token, entryTypeName).add(indexedEntry); } } - + public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); + } + + public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, entryTypeName); + } }