1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.engine;
17 import java.io.ByteArrayOutputStream;
18 import java.io.DataInput;
19 import java.io.DataInputStream;
20 import java.io.DataOutput;
22 import java.io.IOException;
23 import java.io.ObjectOutputStream;
24 import java.io.PrintStream;
25 import java.io.RandomAccessFile;
26 import java.nio.channels.Channels;
27 import java.nio.channels.FileChannel;
28 import java.nio.charset.StandardCharsets;
29 import java.util.ArrayList;
30 import java.util.Collections;
31 import java.util.List;
32 import java.util.zip.GZIPOutputStream;
34 import com.hughes.android.dictionary.DictionaryInfo;
35 import com.hughes.util.CachingList;
36 import com.hughes.util.raf.RAFList;
37 import com.hughes.util.raf.RAFListSerializer;
38 import com.hughes.util.raf.RAFSerializable;
/**
 * A dictionary: header metadata (format version, creation time, info string)
 * plus the entry, source and index lists declared below.
 */
40 public class Dictionary implements RAFSerializable<Dictionary> {
// NOTE(review): presumably the capacity used for cached entry lists; no use is visible in this view - confirm.
42 private static final int CACHE_SIZE = 5000;
// Format version given to newly created dictionaries; also the highest version accepted when loading.
44 private static final int CURRENT_DICT_VERSION = 7;
// Trailer string written after the last section and verified after loading.
45 private static final String END_OF_DICTIONARY = "END OF DICTIONARY";
// File format version: the first int of a serialized dictionary.
48 final int dictFileVersion;
// Creation timestamp in milliseconds: System.currentTimeMillis() for new dictionaries, otherwise read from the file.
49 private final long creationMillis;
// Info string stored in the header right after the creation timestamp.
50 public final String dictInfo;
51 public final List<PairEntry> pairEntries;
52 public final List<TextEntry> textEntries;
// Set to an empty list in the file-loading constructor, alongside the version >= 5 load path.
53 public final List<HtmlEntry> htmlEntries;
// Loaded from the file when its format version is 7 or later; write() asserts that it is null.
54 public final List<byte[]> htmlData;
55 public final List<EntrySource> sources;
56 public final List<Index> indices;
59 * File format history: dictFileVersion 1 adds links to sources (?);
60 * dictFileVersion 2 adds counts of tokens in indices.
/**
 * Creates a new, empty dictionary stamped with CURRENT_DICT_VERSION and the
 * current wall-clock time. The entry, source and index lists assigned here
 * start out as empty ArrayLists.
 *
 * @param dictInfo info string stored in the dictInfo field
 */
63 public Dictionary(final String dictInfo) {
64 this.dictFileVersion = CURRENT_DICT_VERSION;
65 this.creationMillis = System.currentTimeMillis();
66 this.dictInfo = dictInfo;
67 pairEntries = new ArrayList<>();
68 textEntries = new ArrayList<>();
69 htmlEntries = new ArrayList<>();
// NOTE(review): no assignment to the final field htmlData is visible in this view - confirm it is initialised here.
71 sources = new ArrayList<>();
72 indices = new ArrayList<>();
/**
 * Loads a dictionary from a file channel. Reads the header (version int,
 * creation time long, info string), then sets up the sources, pair entries,
 * text entries, HTML entries, HTML data and indices lists in that order, and
 * finally checks the END_OF_DICTIONARY trailer.
 *
 * @param ch channel positioned at the start of the dictionary data
 * @throws IOException if the version is negative or newer than
 *         CURRENT_DICT_VERSION, if a RuntimeException occurs while setting up
 *         the lists (it is wrapped), or if the trailer does not match
 */
75 public Dictionary(final FileChannel ch) throws IOException {
// Header fields are read through a stream view of the channel.
76 DataInput raf = new DataInputStream(Channels.newInputStream(ch));
77 dictFileVersion = raf.readInt();
// Reject files whose version lies outside 0..CURRENT_DICT_VERSION.
78 if (dictFileVersion < 0 || dictFileVersion > CURRENT_DICT_VERSION) {
79 throw new IOException("Invalid dictionary version: " + dictFileVersion);
81 creationMillis = raf.readLong();
82 dictInfo = raf.readUTF();
84 // Load the sources, then seek past them, because reading them later
85 // disrupts the offset.
87 final RAFList<EntrySource> rafSources = RAFList.create(ch, new EntrySource.Serializer(
88 this), ch.position(), dictFileVersion, dictInfo + " sources: ");
// Copy the sources into a plain in-memory list.
89 sources = new ArrayList<>(rafSources);
// Continue reading directly after the end of the sources list.
90 ch.position(rafSources.getEndOffset());
// NOTE(review): the trailing arguments of the CachingList.create calls below are not visible in this view.
92 pairEntries = CachingList.create(
93 RAFList.create(ch, new PairEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " pairs: "),
95 textEntries = CachingList.create(
96 RAFList.create(ch, new TextEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " text: "),
// The HTML entry list is read only for format version 5 and later.
98 if (dictFileVersion >= 5) {
99 htmlEntries = CachingList.create(
100 RAFList.create(ch, new HtmlEntry.Serializer(this, ch), ch.position(), dictFileVersion, dictInfo + " html: "),
// Otherwise an empty list is used (the branch keyword is not visible in this view).
103 htmlEntries = Collections.emptyList();
// The separate HTML data list is read only for format version 7 and later.
105 if (dictFileVersion >= 7) {
106 htmlData = RAFList.create(ch, new HtmlEntry.DataDeserializer(), ch.position(), dictFileVersion, dictInfo + " html: ");
// Indices are wrapped with createFullyCached, unlike the entry lists above.
110 indices = CachingList.createFullyCached(RAFList.create(ch, new IndexSerializer(ch),
111 ch.position(), dictFileVersion, dictInfo + " index: "));
// Surface unchecked failures to callers as IOException, keeping the cause.
112 } catch (RuntimeException e) {
113 throw new IOException("RuntimeException loading dictionary", e);
// The data must end with the END_OF_DICTIONARY marker string.
115 final String end = raf.readUTF();
116 if (!end.equals(END_OF_DICTIONARY)) {
117 throw new IOException("Dictionary seems corrupt: " + end);
/**
 * Serializes this dictionary: version, creation time and info string, then
 * the sources, pair entries, text entries, HTML entry index, HTML data and
 * indices lists, followed by the END_OF_DICTIONARY marker. The file offset at
 * the start of each section is printed to System.out.
 *
 * @param out destination; it is cast to RandomAccessFile unconditionally, so
 *        any other DataOutput fails with a ClassCastException
 * @throws IOException if writing fails
 * @throws RuntimeException if dictFileVersion is below 7
 */
122 public void write(DataOutput out) throws IOException {
123 RandomAccessFile raf = (RandomAccessFile)out;
124 if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!");
125 raf.writeInt(dictFileVersion);
126 raf.writeLong(creationMillis);
127 raf.writeUTF(dictInfo);
128 System.out.println("sources start: " + raf.getFilePointer());
129 RAFList.write(raf, sources, new EntrySource.Serializer(this));
130 System.out.println("pair start: " + raf.getFilePointer());
131 RAFList.write(raf, pairEntries, new PairEntry.Serializer(this), 64, true);
132 System.out.println("text start: " + raf.getFilePointer());
133 RAFList.write(raf, textEntries, new TextEntry.Serializer(this));
134 System.out.println("html index start: " + raf.getFilePointer());
// The serializer is given a null channel here, unlike the loading constructor which passes the channel.
135 RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this, null), 64, true);
136 System.out.println("html data start: " + raf.getFilePointer());
// The HTML data section is produced from htmlEntries, not from htmlData.
// This check only runs when assertions are enabled.
137 assert htmlData == null;
138 RAFList.write(raf, htmlEntries, new HtmlEntry.DataSerializer(), 128, true);
139 System.out.println("indices start: " + raf.getFilePointer());
140 RAFList.write(raf, indices, new IndexSerializer(null));
141 System.out.println("end: " + raf.getFilePointer());
142 raf.writeUTF(END_OF_DICTIONARY);
/**
 * Writes the sources list in the v6 layout: an int element count, a region
 * reserved for (count + 1) 8-byte offsets, then for each source its name
 * (writeUTF) and its entry count (int).
 *
 * NOTE(review): the statements that seek back into the offset table before
 * each writeLong(dataPos) are not visible in this view - confirm they exist.
 *
 * @param out file to write to, positioned where the list should start
 * @throws IOException if writing fails
 */
145 private void writev6Sources(RandomAccessFile out) throws IOException {
146 out.writeInt(sources.size());
// Remember where the offset table begins, then skip over it: 8 bytes per element plus 8 more.
147 long tocPos = out.getFilePointer();
148 out.seek(tocPos + sources.size() * 8 + 8);
149 for (EntrySource s : sources) {
// Offset at which this element's data starts.
150 long dataPos = out.getFilePointer();
152 out.writeLong(dataPos);
155 out.writeUTF(s.getName());
156 out.writeInt(s.getNumEntries());
// Final offset: the position just past the last element's data.
158 long dataPos = out.getFilePointer();
160 out.writeLong(dataPos);
/**
 * Writes the pair entries in the v6 layout: an int element count, a region
 * reserved for (count + 1) 8-byte offsets, then for each entry the index of
 * its entry source (short), the number of pairs (int) and each pair's lang1
 * and lang2 strings (writeUTF).
 *
 * NOTE(review): the statements that seek back into the offset table before
 * each writeLong(dataPos) are not visible in this view - confirm they exist.
 *
 * @param out file to write to, positioned where the list should start
 * @throws IOException if writing fails
 */
164 private void writev6PairEntries(RandomAccessFile out) throws IOException {
165 out.writeInt(pairEntries.size());
166 long tocPos = out.getFilePointer();
// Skip the offset table. NOTE(review): size() * 8 is evaluated in int arithmetic before widening to long.
167 out.seek(tocPos + pairEntries.size() * 8 + 8);
168 for (PairEntry pe : pairEntries) {
// Offset at which this entry's data starts.
169 long dataPos = out.getFilePointer();
171 out.writeLong(dataPos);
// The source index is stored as a 16-bit value.
174 out.writeShort(pe.entrySource.index());
175 out.writeInt(pe.pairs.size());
176 for (PairEntry.Pair p : pe.pairs) {
177 out.writeUTF(p.lang1);
178 out.writeUTF(p.lang2);
// Final offset: the position just past the last entry's data.
181 long dataPos = out.getFilePointer();
183 out.writeLong(dataPos);
/**
 * Writes the text entries in the v6 layout: an int element count, a region
 * reserved for (count + 1) 8-byte offsets, then for each entry the index of
 * its entry source (short) and its text (writeUTF).
 *
 * NOTE(review): the statements that seek back into the offset table before
 * each writeLong(dataPos) are not visible in this view - confirm they exist.
 *
 * @param out file to write to, positioned where the list should start
 * @throws IOException if writing fails
 */
187 private void writev6TextEntries(RandomAccessFile out) throws IOException {
188 out.writeInt(textEntries.size());
// Remember where the offset table begins, then skip over it: 8 bytes per element plus 8 more.
189 long tocPos = out.getFilePointer();
190 out.seek(tocPos + textEntries.size() * 8 + 8);
191 for (TextEntry t : textEntries) {
// Offset at which this entry's data starts.
192 long dataPos = out.getFilePointer();
194 out.writeLong(dataPos);
197 out.writeShort(t.entrySource.index());
198 out.writeUTF(t.text);
// Final offset: the position just past the last entry's data.
200 long dataPos = out.getFilePointer();
202 out.writeLong(dataPos);
/**
 * Writes the HTML entries in the v6 layout: an int element count, a region
 * reserved for (count + 1) 8-byte offsets, then for each entry the index of
 * its entry source (short), its title (writeUTF), the length of its HTML
 * encoded as UTF-8 (int), and the size and contents of a byte buffer that
 * backs a GZIPOutputStream.
 *
 * NOTE(review): the statements that feed the UTF-8 bytes into the gzip
 * stream and close it are not visible in this view - confirm they run before
 * baos.size() is read. The seeks back into the offset table are likewise not
 * visible.
 *
 * @param out file to write to, positioned where the list should start
 * @throws IOException if writing or compression fails
 */
206 private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
207 out.writeInt(htmlEntries.size());
208 long tocPos = out.getFilePointer();
209 out.seek(tocPos + htmlEntries.size() * 8 + 8);
210 for (HtmlEntry h : htmlEntries) {
// Offset at which this entry's data starts.
211 long dataPos = out.getFilePointer();
213 out.writeLong(dataPos);
216 out.writeShort(h.entrySource.index());
217 out.writeUTF(h.title);
218 byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8);
// Length of the HTML in UTF-8 bytes, before compression.
219 out.writeInt(data.length);
220 ByteArrayOutputStream baos = new ByteArrayOutputStream();
221 GZIPOutputStream gzout = new GZIPOutputStream(baos);
// Length of the buffered bytes, followed by the bytes themselves.
224 out.writeInt(baos.size());
225 out.write(baos.toByteArray());
// Final offset: the position just past the last entry's data.
227 long dataPos = out.getFilePointer();
229 out.writeLong(dataPos);
/**
 * Writes a list of references to HTML entries in the v6 layout: an int
 * element count, a region reserved for (count + 1) 8-byte offsets, then for
 * each entry the int returned by its index() method.
 *
 * NOTE(review): the statements that seek back into the offset table before
 * each writeLong(dataPos) are not visible in this view - confirm they exist.
 *
 * @param out file to write to, positioned where the list should start
 * @param entries HTML entries whose indices are written
 * @throws IOException if writing fails
 */
233 private void writev6HtmlIndices(RandomAccessFile out, List<HtmlEntry> entries) throws IOException {
234 out.writeInt(entries.size());
// Remember where the offset table begins, then skip over it: 8 bytes per element plus 8 more.
235 long tocPos = out.getFilePointer();
236 out.seek(tocPos + entries.size() * 8 + 8);
237 for (HtmlEntry e : entries) {
// Offset at which this element's data starts.
238 long dataPos = out.getFilePointer();
240 out.writeLong(dataPos);
// Only the entry's index is stored, not its content.
243 out.writeInt(e.index());
// Final offset: the position just past the last element's data.
245 long dataPos = out.getFilePointer();
247 out.writeLong(dataPos);
/**
 * Writes index entries in the v6 layout: an int element count, a region
 * reserved for (count + 1) 8-byte offsets, then for each entry its token
 * (writeUTF), startRow and numRows (ints), a boolean telling whether the
 * normalized token differs from the token, the normalized token itself when
 * it differs, and finally the entry's HTML entry references as a nested list.
 *
 * NOTE(review): the statements that seek back into the offset table before
 * each writeLong(dataPos) are not visible in this view - confirm they exist.
 *
 * @param out file to write to, positioned where the list should start
 * @param entries index entries to write
 * @throws IOException if writing fails
 */
251 private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries) throws IOException {
252 out.writeInt(entries.size());
253 long tocPos = out.getFilePointer();
// Skip the offset table. NOTE(review): size() * 8 is evaluated in int arithmetic before widening to long.
254 out.seek(tocPos + entries.size() * 8 + 8);
255 for (Index.IndexEntry e : entries) {
// Offset at which this entry's data starts.
256 long dataPos = out.getFilePointer();
258 out.writeLong(dataPos);
261 out.writeUTF(e.token);
262 out.writeInt(e.startRow);
263 out.writeInt(e.numRows);
// The normalized form is stored only when it differs from the raw token.
264 final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
265 out.writeBoolean(hasNormalizedForm);
266 if (hasNormalizedForm) out.writeUTF(e.normalizedToken());
267 writev6HtmlIndices(out, e.htmlEntries);
// Final offset: the position just past the last entry's data.
269 long dataPos = out.getFilePointer();
271 out.writeLong(dataPos);
/**
 * Writes the indices in the v6 layout: an int element count, a region
 * reserved for (count + 1) 8-byte offsets, then for each index its short and
 * long names, the ISO code of its sort language and its normalizer rules
 * (all writeUTF), the swapPairEntries flag, the main token count, the sorted
 * index entries as a nested list, the Java-serialized stoplist preceded by
 * its byte length, and the row count followed by per-row data.
 *
 * NOTE(review): several statements are not visible in this view - the seeks
 * back into the offset table, the write of the serialized stoplist bytes, the
 * type codes for rows other than TokenRow, and the write of the row type.
 * Confirm them against the full file.
 *
 * @param out file to write to, positioned where the list should start
 * @throws IOException if writing fails
 * @throws RuntimeException if a row is of a type other than those tested below
 */
275 private void writev6Index(RandomAccessFile out) throws IOException {
276 out.writeInt(indices.size());
277 long tocPos = out.getFilePointer();
278 out.seek(tocPos + indices.size() * 8 + 8);
279 for (Index idx : indices) {
// Offset at which this index's data starts.
280 long dataPos = out.getFilePointer();
282 out.writeLong(dataPos);
285 out.writeUTF(idx.shortName);
286 out.writeUTF(idx.longName);
287 out.writeUTF(idx.sortLanguage.getIsoCode());
288 out.writeUTF(idx.normalizerRules);
289 out.writeBoolean(idx.swapPairEntries);
290 out.writeInt(idx.mainTokenCount);
291 writev6IndexEntries(out, idx.sortedIndexEntries);
293 // write stoplist, serializing the whole Set *shudder*
294 final ByteArrayOutputStream baos = new ByteArrayOutputStream();
295 final ObjectOutputStream oos = new ObjectOutputStream(baos);
296 oos.writeObject(idx.stoplist);
// Byte length of the serialized stoplist.
298 final byte[] bytes = baos.toByteArray();
299 out.writeInt(bytes.length);
302 out.writeInt(idx.rows.size());
// Each row is classified by its concrete class.
304 for (RowBase r : idx.rows) {
306 if (r instanceof PairEntry.Row) {
308 } else if (r instanceof TokenRow) {
309 final TokenRow tokenRow = (TokenRow)r;
// Token rows get type 1 when they have a main entry and type 3 otherwise.
310 type = tokenRow.hasMainEntry ? 1 : 3;
311 } else if (r instanceof TextEntry.Row) {
313 } else if (r instanceof HtmlEntry.Row) {
316 throw new RuntimeException("Row type not supported for v6");
319 out.writeInt(r.referenceIndex);
// Final offset: the position just past the last index's data.
322 long dataPos = out.getFilePointer();
324 out.writeLong(dataPos);
/**
 * Serializes this dictionary using the writev6* helpers: creation time and
 * info string, then the pair entries, text entries and HTML entries sections,
 * followed by the END_OF_DICTIONARY marker. The file offset at each stage is
 * printed to System.out.
 *
 * NOTE(review): the version int and the calls that write the sources and
 * indices sections are not visible in this view, although "sources start"
 * and "indices start" offsets are printed - confirm against the full file.
 *
 * @param out destination; it is cast to RandomAccessFile unconditionally, so
 *        any other DataOutput fails with a ClassCastException
 * @throws IOException if writing fails
 */
328 public void writev6(DataOutput out) throws IOException {
329 RandomAccessFile raf = (RandomAccessFile)out;
331 raf.writeLong(creationMillis);
332 raf.writeUTF(dictInfo);
333 System.out.println("sources start: " + raf.getFilePointer());
335 System.out.println("pair start: " + raf.getFilePointer());
336 writev6PairEntries(raf);
337 System.out.println("text start: " + raf.getFilePointer());
338 writev6TextEntries(raf);
339 System.out.println("html index start: " + raf.getFilePointer());
340 writev6HtmlEntries(raf);
341 System.out.println("indices start: " + raf.getFilePointer());
343 System.out.println("end: " + raf.getFilePointer());
344 raf.writeUTF(END_OF_DICTIONARY);
/**
 * RAFListSerializer for Index elements. As a non-static inner class it has
 * access to the enclosing Dictionary, which read() passes on to each Index
 * it constructs.
 */
347 private final class IndexSerializer implements RAFListSerializer<Index> {
// Channel handed to each Index created by read(); callers in this file pass null when writing.
348 private final FileChannel ch;
// NOTE(review): the constructor body is not visible in this view; presumably it stores ch - confirm.
350 IndexSerializer(FileChannel ch) {
// Builds an Index from the enclosing dictionary, the channel and the input; readIndex is not used.
355 public Index read(DataInput raf, final int readIndex) throws IOException {
356 return new Index(Dictionary.this, ch, raf);
// NOTE(review): the body of write is not visible in this view.
360 public void write(DataOutput raf, Index t) throws IOException {
/**
 * Serializer that represents an HtmlEntry by its position in htmlEntries:
 * read() reads an int and returns the entry at that position.
 */
365 final RAFListSerializer<HtmlEntry> htmlEntryIndexSerializer = new RAFListSerializer<HtmlEntry>() {
// NOTE(review): the body of write is not visible in this view.
367 public void write(DataOutput raf, HtmlEntry t) {
// Resolves the stored int against this dictionary's htmlEntries list; readIndex is not used.
372 public HtmlEntry read(DataInput raf, int readIndex) throws IOException {
373 return htmlEntries.get(raf.readInt());
/**
 * Prints a human-readable summary to the given stream: the info string, then
 * the name and entry count of every source, then the short and long name of
 * every index.
 *
 * @param out stream that receives the summary
 */
377 public void print(final PrintStream out) {
378 out.println("dictInfo=" + dictInfo);
379 for (final EntrySource entrySource : sources) {
380 out.printf("EntrySource: %s %d\n", entrySource.name, entrySource.numEntries);
383 for (final Index index : indices) {
384 out.printf("Index: %s %s\n", index.shortName, index.longName);
/**
 * Builds a DictionaryInfo describing this dictionary: its creation time, its
 * info string, and one index info per entry of indices, in list order.
 *
 * NOTE(review): the return statement is not visible in this view; presumably
 * the populated result is returned - confirm.
 */
390 public DictionaryInfo getDictionaryInfo() {
391 final DictionaryInfo result = new DictionaryInfo();
392 result.creationMillis = this.creationMillis;
393 result.dictInfo = this.dictInfo;
// Collect the summary that each index provides for itself.
394 for (final Index index : indices) {
395 result.indexInfos.add(index.getIndexInfo());
400 public static DictionaryInfo getDictionaryInfo(final File file) {
401 RandomAccessFile raf = null;
403 raf = new RandomAccessFile(file, "r");
404 final Dictionary dict = new Dictionary(raf.getChannel());
405 final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo();
406 dictionaryInfo.uncompressedFilename = file.getName();
407 dictionaryInfo.uncompressedBytes = file.length();
409 return dictionaryInfo;
410 } catch (IOException e) {
411 final DictionaryInfo dictionaryInfo = new DictionaryInfo();
412 dictionaryInfo.uncompressedFilename = file.getName();
413 dictionaryInfo.uncompressedBytes = file.length();
414 return dictionaryInfo;
419 } catch (IOException e) {