1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.engine;
17 import java.io.ByteArrayOutputStream;
18 import java.io.DataInput;
19 import java.io.DataInputStream;
20 import java.io.DataOutput;
22 import java.io.IOException;
23 import java.io.ObjectOutputStream;
24 import java.io.PrintStream;
25 import java.io.RandomAccessFile;
26 import java.nio.channels.Channels;
27 import java.nio.channels.FileChannel;
28 import java.nio.charset.StandardCharsets;
29 import java.util.ArrayList;
30 import java.util.Collections;
31 import java.util.List;
32 import java.util.zip.GZIPOutputStream;
34 import com.hughes.android.dictionary.DictionaryInfo;
35 import com.hughes.util.CachingList;
36 import com.hughes.util.raf.RAFList;
37 import com.hughes.util.raf.RAFListSerializer;
38 import com.hughes.util.raf.RAFSerializable;
40 public class Dictionary implements RAFSerializable<Dictionary> {
// NOTE(review): presumably the capacity handed to CachingList for the entry lists — confirm at the use site.
private static final int CACHE_SIZE = 5000;

// Newest on-disk format version: assigned to freshly built dictionaries and
// used as the upper bound when validating a file header.
private static final int CURRENT_DICT_VERSION = 7;
// Trailer string written after the index list and verified when loading.
private static final String END_OF_DICTIONARY = "END OF DICTIONARY";

// Format version of this instance (CURRENT_DICT_VERSION or the value read from the file).
final int dictFileVersion;
// Creation timestamp in milliseconds; System.currentTimeMillis() for new dictionaries.
private final long creationMillis;
// Free-form description string stored in the file header.
public final String dictInfo;
public final List<PairEntry> pairEntries;
public final List<TextEntry> textEntries;
public final List<HtmlEntry> htmlEntries;
// Only populated when a file of version >= 7 is loaded.
public final List<byte[]> htmlData;
public final List<EntrySource> sources;
public final List<Index> indices;
/*
 * File format history: dictFileVersion 1 adds links to sources (?);
 * dictFileVersion 2 adds counts of tokens in indices.
 */
/**
 * Creates an empty dictionary that is to be populated in memory.
 *
 * <p>The instance is stamped with the current format version and the
 * current wall-clock time.
 *
 * @param dictInfo free-form description stored in the file header
 */
public Dictionary(final String dictInfo) {
    this.dictFileVersion = CURRENT_DICT_VERSION;
    this.creationMillis = System.currentTimeMillis();
    this.dictInfo = dictInfo;
    this.pairEntries = new ArrayList<>();
    this.textEntries = new ArrayList<>();
    this.htmlEntries = new ArrayList<>();
    // Every final field must be assigned; write() asserts that htmlData is
    // null for dictionaries built in memory.
    this.htmlData = null;
    this.sources = new ArrayList<>();
    this.indices = new ArrayList<>();
}
/**
 * Loads a dictionary from an open file channel.
 *
 * <p>The header (version, creation time, description) is read through a
 * stream layered on the channel; each following list is opened at the
 * channel's current position, so the statement order below matters.
 *
 * @param ch channel positioned at the start of the dictionary data
 * @throws IOException if the version is out of range, the end marker does
 *         not match, or a RuntimeException occurs while opening the lists
 */
public Dictionary(final FileChannel ch) throws IOException {
    DataInput raf = new DataInputStream(Channels.newInputStream(ch));
    dictFileVersion = raf.readInt();
    if (dictFileVersion < 0 || dictFileVersion > CURRENT_DICT_VERSION) {
        throw new IOException("Invalid dictionary version: " + dictFileVersion);
    }
    creationMillis = raf.readLong();
    dictInfo = raf.readUTF();

    // Load the sources, then seek past them, because reading them later
    // disrupts the offset.
    try {
        final RAFList<EntrySource> rafSources = RAFList.create(ch, new EntrySource.Serializer(
                this), ch.position(), dictFileVersion, dictInfo + " sources: ");
        sources = new ArrayList<>(rafSources);
        ch.position(rafSources.getEndOffset());

        // NOTE(review): the trailing "CACHE_SIZE, false" arguments of the three
        // CachingList.create calls should be confirmed against CachingList's signature.
        pairEntries = CachingList.create(
                RAFList.create(ch, new PairEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " pairs: "),
                CACHE_SIZE, false);
        textEntries = CachingList.create(
                RAFList.create(ch, new TextEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " text: "),
                CACHE_SIZE, false);
        if (dictFileVersion >= 5) {
            htmlEntries = CachingList.create(
                    RAFList.create(ch, new HtmlEntry.Serializer(this, ch), ch.position(), dictFileVersion, dictInfo + " html: "),
                    CACHE_SIZE, false);
        } else {
            // Files older than version 5 carry no HTML entry list.
            htmlEntries = Collections.emptyList();
        }
        if (dictFileVersion >= 7) {
            htmlData = RAFList.create(ch, new HtmlEntry.DataDeserializer(), ch.position(), dictFileVersion, dictInfo + " html: ");
        } else {
            // Files older than version 7 carry no separate HTML data list.
            htmlData = null;
        }
        indices = CachingList.createFullyCached(RAFList.create(ch, new IndexSerializer(ch),
                ch.position(), dictFileVersion, dictInfo + " index: "));
    } catch (RuntimeException e) {
        // Surface unchecked failures as a checked load error, keeping the cause.
        throw new IOException("RuntimeException loading dictionary", e);
    }
    final String end = raf.readUTF();
    if (!end.equals(END_OF_DICTIONARY)) {
        throw new IOException("Dictionary seems corrupt: " + end);
    }
}
122 public void write(DataOutput out) throws IOException {
123 RandomAccessFile raf = (RandomAccessFile)out;
124 if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!");
125 raf.writeInt(dictFileVersion);
126 raf.writeLong(creationMillis);
127 raf.writeUTF(dictInfo);
128 System.out.println("sources start: " + raf.getFilePointer());
129 RAFList.write(raf, sources, new EntrySource.Serializer(this));
130 System.out.println("pair start: " + raf.getFilePointer());
131 RAFList.write(raf, pairEntries, new PairEntry.Serializer(this), 64, true);
132 System.out.println("text start: " + raf.getFilePointer());
133 RAFList.write(raf, textEntries, new TextEntry.Serializer(this));
134 System.out.println("html index start: " + raf.getFilePointer());
135 RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this, null), 64, true);
136 System.out.println("html data start: " + raf.getFilePointer());
137 assert htmlData == null;
138 RAFList.write(raf, htmlEntries, new HtmlEntry.DataSerializer(), 128, true);
139 System.out.println("indices start: " + raf.getFilePointer());
140 RAFList.write(raf, indices, new IndexSerializer(null));
141 System.out.println("end: " + raf.getFilePointer());
142 raf.writeUTF(END_OF_DICTIONARY);
145 private void writev6Sources(RandomAccessFile out) throws IOException {
146 out.writeInt(sources.size());
147 long tocPos = out.getFilePointer();
148 out.seek(tocPos + sources.size() * 8 + 8);
149 for (EntrySource s : sources) {
150 long dataPos = out.getFilePointer();
152 out.writeLong(dataPos);
155 out.writeUTF(s.getName());
156 out.writeInt(s.getNumEntries());
158 long dataPos = out.getFilePointer();
160 out.writeLong(dataPos);
164 private void writev6PairEntries(RandomAccessFile out) throws IOException {
165 out.writeInt(pairEntries.size());
166 long tocPos = out.getFilePointer();
167 out.seek(tocPos + pairEntries.size() * 8 + 8);
168 for (PairEntry pe : pairEntries) {
169 long dataPos = out.getFilePointer();
171 out.writeLong(dataPos);
174 out.writeShort(pe.entrySource.index());
175 out.writeInt(pe.pairs.size());
176 for (PairEntry.Pair p : pe.pairs) {
177 out.writeUTF(p.lang1);
178 out.writeUTF(p.lang2);
181 long dataPos = out.getFilePointer();
183 out.writeLong(dataPos);
187 private void writev6TextEntries(RandomAccessFile out) throws IOException {
188 out.writeInt(textEntries.size());
189 long tocPos = out.getFilePointer();
190 out.seek(tocPos + textEntries.size() * 8 + 8);
191 for (TextEntry t : textEntries) {
192 long dataPos = out.getFilePointer();
194 out.writeLong(dataPos);
197 out.writeShort(t.entrySource.index());
198 out.writeUTF(t.text);
200 long dataPos = out.getFilePointer();
202 out.writeLong(dataPos);
206 private void writev6EmptyList(RandomAccessFile out) throws IOException {
208 out.writeLong(out.getFilePointer() + 8);
211 private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
212 out.writeInt(htmlEntries.size());
213 long tocPos = out.getFilePointer();
214 out.seek(tocPos + htmlEntries.size() * 8 + 8);
215 for (HtmlEntry h : htmlEntries) {
216 long dataPos = out.getFilePointer();
218 out.writeLong(dataPos);
221 out.writeShort(h.entrySource.index());
222 out.writeUTF(h.title);
223 byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8);
224 out.writeInt(data.length);
225 ByteArrayOutputStream baos = new ByteArrayOutputStream();
226 GZIPOutputStream gzout = new GZIPOutputStream(baos);
229 out.writeInt(baos.size());
230 out.write(baos.toByteArray());
232 long dataPos = out.getFilePointer();
234 out.writeLong(dataPos);
238 private void writev6HtmlIndices(RandomAccessFile out, List<HtmlEntry> entries) throws IOException {
239 out.writeInt(entries.size());
240 long tocPos = out.getFilePointer();
241 out.seek(tocPos + entries.size() * 8 + 8);
242 for (HtmlEntry e : entries) {
243 long dataPos = out.getFilePointer();
245 out.writeLong(dataPos);
248 out.writeInt(e.index());
250 long dataPos = out.getFilePointer();
252 out.writeLong(dataPos);
256 private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries, int[] prunedRowIdx) throws IOException {
257 out.writeInt(entries.size());
258 long tocPos = out.getFilePointer();
259 out.seek(tocPos + entries.size() * 8 + 8);
260 for (Index.IndexEntry e : entries) {
261 long dataPos = out.getFilePointer();
263 out.writeLong(dataPos);
266 out.writeUTF(e.token);
268 int startRow = e.startRow;
269 int numRows = e.numRows;
270 if (prunedRowIdx != null) {
271 // note: the start row will always be a TokenRow
272 // and thus never be pruned
274 for (int i = 1; i < numRows; i++) {
275 if (prunedRowIdx[startRow + i] >= 0) newNumRows++;
277 startRow = prunedRowIdx[startRow];
278 numRows = newNumRows;
281 out.writeInt(startRow);
282 out.writeInt(numRows);
283 final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
284 out.writeBoolean(hasNormalizedForm);
285 if (hasNormalizedForm) out.writeUTF(e.normalizedToken());
286 writev6HtmlIndices(out, prunedRowIdx == null ? e.htmlEntries : Collections.<HtmlEntry>emptyList());
288 long dataPos = out.getFilePointer();
290 out.writeLong(dataPos);
294 private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException {
295 out.writeInt(indices.size());
296 long tocPos = out.getFilePointer();
297 out.seek(tocPos + indices.size() * 8 + 8);
298 for (Index idx : indices) {
299 // create pruned index for skipHtml feature
300 int[] prunedRowIdx = null;
303 prunedRowIdx = new int[idx.rows.size()];
304 for (int i = 0; i < idx.rows.size(); i++) {
305 final RowBase r = idx.rows.get(i);
306 // prune Html entries
307 boolean pruned = r instanceof HtmlEntry.Row;
308 prunedRowIdx[i] = pruned ? -1 : prunedSize;
309 if (!pruned) prunedSize++;
313 long dataPos = out.getFilePointer();
315 out.writeLong(dataPos);
318 out.writeUTF(idx.shortName);
319 out.writeUTF(idx.longName);
320 out.writeUTF(idx.sortLanguage.getIsoCode());
321 out.writeUTF(idx.normalizerRules);
322 out.writeBoolean(idx.swapPairEntries);
323 out.writeInt(idx.mainTokenCount);
324 writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx);
326 // write stoplist, serializing the whole Set *shudder*
327 final ByteArrayOutputStream baos = new ByteArrayOutputStream();
328 final ObjectOutputStream oos = new ObjectOutputStream(baos);
329 oos.writeObject(idx.stoplist);
331 final byte[] bytes = baos.toByteArray();
332 out.writeInt(bytes.length);
335 out.writeInt(skipHtml ? prunedSize : idx.rows.size());
337 for (RowBase r : idx.rows) {
339 if (r instanceof PairEntry.Row) {
341 } else if (r instanceof TokenRow) {
342 final TokenRow tokenRow = (TokenRow)r;
343 type = tokenRow.hasMainEntry ? 1 : 3;
344 } else if (r instanceof TextEntry.Row) {
346 } else if (r instanceof HtmlEntry.Row) {
348 if (skipHtml) continue;
350 throw new RuntimeException("Row type not supported for v6");
353 out.writeInt(r.referenceIndex);
356 long dataPos = out.getFilePointer();
358 out.writeLong(dataPos);
362 public void writev6(DataOutput out, boolean skipHtml) throws IOException {
363 RandomAccessFile raf = (RandomAccessFile)out;
365 raf.writeLong(creationMillis);
366 raf.writeUTF(dictInfo);
367 System.out.println("sources start: " + raf.getFilePointer());
369 System.out.println("pair start: " + raf.getFilePointer());
370 writev6PairEntries(raf);
371 System.out.println("text start: " + raf.getFilePointer());
372 writev6TextEntries(raf);
373 System.out.println("html index start: " + raf.getFilePointer());
374 if (skipHtml) writev6EmptyList(raf);
375 else writev6HtmlEntries(raf);
376 System.out.println("indices start: " + raf.getFilePointer());
377 writev6Index(raf, skipHtml);
378 System.out.println("end: " + raf.getFilePointer());
379 raf.writeUTF(END_OF_DICTIONARY);
382 private final class IndexSerializer implements RAFListSerializer<Index> {
383 private final FileChannel ch;
385 IndexSerializer(FileChannel ch) {
390 public Index read(DataInput raf, final int readIndex) throws IOException {
391 return new Index(Dictionary.this, ch, raf);
395 public void write(DataOutput raf, Index t) throws IOException {
400 final RAFListSerializer<HtmlEntry> htmlEntryIndexSerializer = new RAFListSerializer<HtmlEntry>() {
402 public void write(DataOutput raf, HtmlEntry t) {
407 public HtmlEntry read(DataInput raf, int readIndex) throws IOException {
408 return htmlEntries.get(raf.readInt());
412 public void print(final PrintStream out) {
413 out.println("dictInfo=" + dictInfo);
414 for (final EntrySource entrySource : sources) {
415 out.printf("EntrySource: %s %d\n", entrySource.name, entrySource.numEntries);
418 for (final Index index : indices) {
419 out.printf("Index: %s %s\n", index.shortName, index.longName);
425 public DictionaryInfo getDictionaryInfo() {
426 final DictionaryInfo result = new DictionaryInfo();
427 result.creationMillis = this.creationMillis;
428 result.dictInfo = this.dictInfo;
429 for (final Index index : indices) {
430 result.indexInfos.add(index.getIndexInfo());
435 public static DictionaryInfo getDictionaryInfo(final File file) {
436 RandomAccessFile raf = null;
438 raf = new RandomAccessFile(file, "r");
439 final Dictionary dict = new Dictionary(raf.getChannel());
440 final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo();
441 dictionaryInfo.uncompressedFilename = file.getName();
442 dictionaryInfo.uncompressedBytes = file.length();
444 return dictionaryInfo;
445 } catch (IOException e) {
446 final DictionaryInfo dictionaryInfo = new DictionaryInfo();
447 dictionaryInfo.uncompressedFilename = file.getName();
448 dictionaryInfo.uncompressedBytes = file.length();
449 return dictionaryInfo;
454 } catch (IOException e) {