// Copyright 2011 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.hughes.android.dictionary.engine; import com.hughes.android.dictionary.DictionaryInfo; import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; import com.hughes.android.dictionary.engine.RowBase.RowKey; import com.hughes.util.CachingList; import com.hughes.util.StringUtil; import com.hughes.util.TransformingList; import com.hughes.util.raf.RAFList; import com.hughes.util.raf.RAFSerializable; import com.hughes.util.raf.RAFSerializer; import com.hughes.util.raf.SerializableSerializer; import com.hughes.util.raf.UniformRAFList; import com.ibm.icu.text.Transliterator; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import java.util.AbstractList; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.EnumMap; import java.util.HashSet; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Pattern; public final class Index implements RAFSerializable { private static final int CACHE_SIZE = 5000; public final Dictionary dict; public final String shortName; // Typically the ISO code for the language. public final String longName; // persisted: tells how the entries are sorted. public final Language sortLanguage; public final String normalizerRules; // Built from the two above. private Transliterator normalizer; // persisted public final List sortedIndexEntries; // persisted. public final Set stoplist; // One big list! // Various sub-types. // persisted public final List rows; public final boolean swapPairEntries; // Version 2: @SuppressWarnings("WeakerAccess") public int mainTokenCount = -1; // -------------------------------------------------------------------------- public Index(final Dictionary dict, final String shortName, final String longName, final Language sortLanguage, final String normalizerRules, final boolean swapPairEntries, final Set stoplist) { this.dict = dict; this.shortName = shortName; this.longName = longName; this.sortLanguage = sortLanguage; this.normalizerRules = normalizerRules; this.swapPairEntries = swapPairEntries; sortedIndexEntries = new ArrayList<>(); this.stoplist = stoplist; rows = new ArrayList<>(); normalizer = null; } /** * Deferred initialization because it can be slow. */ @SuppressWarnings("WeakerAccess") public synchronized Transliterator normalizer() { if (normalizer == null) { normalizer = TransliteratorManager.get(normalizerRules); } return normalizer; } /** * Note that using this comparator probably involves doing too many text * normalizations. */ @SuppressWarnings("WeakerAccess") public NormalizeComparator getSortComparator() { return new NormalizeComparator(normalizer(), sortLanguage.getCollator(), dict.dictFileVersion); } public Index(final Dictionary dict, final FileChannel inp, final DataInput raf) throws IOException { this.dict = dict; shortName = raf.readUTF(); longName = raf.readUTF(); final String languageCode = raf.readUTF(); sortLanguage = Language.lookup(languageCode); normalizerRules = raf.readUTF(); swapPairEntries = raf.readBoolean(); if (sortLanguage == null) { throw new IOException("Unsupported language: " + languageCode); } if (dict.dictFileVersion >= 2) { mainTokenCount = raf.readInt(); } sortedIndexEntries = CachingList.create( RAFList.create(inp, new IndexEntrySerializer(dict.dictFileVersion == 6 ? inp : null), inp.position(), dict.dictFileVersion, dict.dictInfo + " idx " + languageCode + ": "), CACHE_SIZE, true); if (dict.dictFileVersion >= 7) { int count = StringUtil.readVarInt(raf); stoplist = new HashSet<>(count); for (int i = 0; i < count; ++i) { stoplist.add(raf.readUTF()); } } else if (dict.dictFileVersion >= 4) { stoplist = new SerializableSerializer>().read(raf); } else { stoplist = Collections.emptySet(); } rows = CachingList.create( UniformRAFList.create(inp, new RowBase.Serializer(this), inp.position()), CACHE_SIZE, true); } @Override public void write(final DataOutput out) throws IOException { RandomAccessFile raf = (RandomAccessFile)out; raf.writeUTF(shortName); raf.writeUTF(longName); raf.writeUTF(sortLanguage.getIsoCode()); raf.writeUTF(normalizerRules); raf.writeBoolean(swapPairEntries); raf.writeInt(mainTokenCount); RAFList.write(raf, sortedIndexEntries, new IndexEntrySerializer(null), 32, true); StringUtil.writeVarInt(raf, stoplist.size()); for (String i : stoplist) { raf.writeUTF(i); } UniformRAFList.write(raf, rows, new RowBase.Serializer(this), 3 /* bytes per entry */); } public void print(final PrintStream out) { for (final RowBase row : rows) { row.print(out); } } private final class IndexEntrySerializer implements RAFSerializer { private final FileChannel ch; IndexEntrySerializer(FileChannel ch) { this.ch = ch; } @Override public IndexEntry read(DataInput raf) throws IOException { return new IndexEntry(Index.this, ch, raf); } @Override public void write(DataOutput raf, IndexEntry t) throws IOException { t.write(raf); } } public static final class IndexEntry implements RAFSerializable { public final String token; private final String normalizedToken; public final int startRow; final int numRows; // doesn't count the token row! public List htmlEntries; public IndexEntry(final Index index, final String token, final String normalizedToken, final int startRow, final int numRows) { assert token.equals(token.trim()); assert token.length() > 0; this.token = token; this.normalizedToken = normalizedToken; this.startRow = startRow; this.numRows = numRows; this.htmlEntries = new ArrayList<>(); } IndexEntry(final Index index, final FileChannel ch, final DataInput raf) throws IOException { token = raf.readUTF(); if (index.dict.dictFileVersion >= 7) { startRow = StringUtil.readVarInt(raf); numRows = StringUtil.readVarInt(raf); } else { startRow = raf.readInt(); numRows = raf.readInt(); } final boolean hasNormalizedForm = raf.readBoolean(); normalizedToken = hasNormalizedForm ? raf.readUTF() : token; if (index.dict.dictFileVersion >= 7) { int size = StringUtil.readVarInt(raf); if (size == 0) { this.htmlEntries = Collections.emptyList(); } else { final int[] htmlEntryIndices = new int[size]; for (int i = 0; i < size; ++i) { htmlEntryIndices[i] = StringUtil.readVarInt(raf); } this.htmlEntries = new AbstractList() { @Override public HtmlEntry get(int i) { return index.dict.htmlEntries.get(htmlEntryIndices[i]); } @Override public int size() { return htmlEntryIndices.length; } }; } } else if (index.dict.dictFileVersion >= 6) { this.htmlEntries = CachingList.create( RAFList.create(ch, index.dict.htmlEntryIndexSerializer, ch.position(), index.dict.dictFileVersion, index.dict.dictInfo + " htmlEntries: "), 1, false); } else { this.htmlEntries = Collections.emptyList(); } } public void write(DataOutput raf) throws IOException { raf.writeUTF(token); StringUtil.writeVarInt(raf, startRow); StringUtil.writeVarInt(raf, numRows); final boolean hasNormalizedForm = !token.equals(normalizedToken); raf.writeBoolean(hasNormalizedForm); if (hasNormalizedForm) { raf.writeUTF(normalizedToken); } StringUtil.writeVarInt(raf, htmlEntries.size()); for (HtmlEntry e : htmlEntries) StringUtil.writeVarInt(raf, e.index()); } public String toString() { return String.format("%s@%d(%d)", token, startRow, numRows); } String normalizedToken() { return normalizedToken; } } private static final TransformingList.Transformer INDEX_ENTRY_TO_TOKEN = new TransformingList.Transformer() { @Override public String transform(IndexEntry t1) { return t1.token; } }; public IndexEntry findExact(final String exactToken) { final int result = Collections.binarySearch( TransformingList.create(sortedIndexEntries, INDEX_ENTRY_TO_TOKEN), exactToken, getSortComparator()); if (result >= 0) { return sortedIndexEntries.get(result); } return null; } public IndexEntry findInsertionPoint(String token, final AtomicBoolean interrupted) { final int index = findInsertionPointIndex(token, interrupted); return index != -1 ? sortedIndexEntries.get(index) : null; } private int compareIdx(String token, final Comparator