]> gitweb.fperrin.net Git - Dictionary.git/blob - src/com/hughes/android/dictionary/engine/Dictionary.java
82ba2cfa6d90c319c1cbcb8d01e6950dbab19402
[Dictionary.git] / src / com / hughes / android / dictionary / engine / Dictionary.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import com.hughes.android.dictionary.DictionaryInfo;
18 import com.hughes.util.CachingList;
19 import com.hughes.util.raf.RAFList;
20 import com.hughes.util.raf.RAFListSerializer;
21 import com.hughes.util.raf.RAFSerializable;
22
23 import java.io.ByteArrayOutputStream;
24 import java.io.DataInput;
25 import java.io.DataInputStream;
26 import java.io.DataOutput;
27 import java.io.File;
28 import java.io.IOException;
29 import java.io.ObjectOutputStream;
30 import java.io.PrintStream;
31 import java.io.RandomAccessFile;
32 import java.nio.channels.Channels;
33 import java.nio.channels.FileChannel;
34 import java.util.ArrayList;
35 import java.util.Collections;
36 import java.util.List;
37 import java.util.zip.GZIPOutputStream;
38
39 public class Dictionary implements RAFSerializable<Dictionary> {
40
41     private static final int CACHE_SIZE = 5000;
42
43     private static final int CURRENT_DICT_VERSION = 7;
44     private static final String END_OF_DICTIONARY = "END OF DICTIONARY";
45
46     // persisted
47     final int dictFileVersion;
48     private final long creationMillis;
49     public final String dictInfo;
50     public final List<PairEntry> pairEntries;
51     public final List<TextEntry> textEntries;
52     public final List<HtmlEntry> htmlEntries;
53     public final List<byte[]> htmlData;
54     public final List<EntrySource> sources;
55     public final List<Index> indices;
56
57     /**
58      * dictFileVersion 1 adds: <li>links to sources? dictFileVersion 2 adds: <li>
59      * counts of tokens in indices.
60      */
61
62     public Dictionary(final String dictInfo) {
63         this.dictFileVersion = CURRENT_DICT_VERSION;
64         this.creationMillis = System.currentTimeMillis();
65         this.dictInfo = dictInfo;
66         pairEntries = new ArrayList<>();
67         textEntries = new ArrayList<>();
68         htmlEntries = new ArrayList<>();
69         htmlData = null;
70         sources = new ArrayList<>();
71         indices = new ArrayList<>();
72     }
73
74     public Dictionary(final FileChannel ch) throws IOException {
75         DataInput raf = new DataInputStream(Channels.newInputStream(ch));
76         dictFileVersion = raf.readInt();
77         if (dictFileVersion < 0 || dictFileVersion > CURRENT_DICT_VERSION) {
78             throw new IOException("Invalid dictionary version: " + dictFileVersion);
79         }
80         creationMillis = raf.readLong();
81         dictInfo = raf.readUTF();
82
83         // Load the sources, then seek past them, because reading them later
84         // disrupts the offset.
85         try {
86             final RAFList<EntrySource> rafSources = RAFList.create(ch, new EntrySource.Serializer(
87                     this), ch.position(), dictFileVersion, dictInfo + " sources: ");
88             sources = new ArrayList<>(rafSources);
89             ch.position(rafSources.getEndOffset());
90
91             pairEntries = CachingList.create(
92                               RAFList.create(ch, new PairEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " pairs: "),
93                               CACHE_SIZE, false);
94             textEntries = CachingList.create(
95                               RAFList.create(ch, new TextEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " text: "),
96                               CACHE_SIZE, true);
97             if (dictFileVersion >= 5) {
98                 htmlEntries = CachingList.create(
99                                   RAFList.create(ch, new HtmlEntry.Serializer(this, ch), ch.position(), dictFileVersion, dictInfo + " html: "),
100                                   CACHE_SIZE, false);
101             } else {
102                 htmlEntries = Collections.emptyList();
103             }
104             if (dictFileVersion >= 7) {
105                 htmlData = RAFList.create(ch, new HtmlEntry.DataDeserializer(), ch.position(), dictFileVersion, dictInfo + " html: ");
106             } else {
107                 htmlData = null;
108             }
109             indices = CachingList.createFullyCached(RAFList.create(ch, new IndexSerializer(ch),
110                                                     ch.position(), dictFileVersion, dictInfo + " index: "));
111         } catch (RuntimeException e) {
112             throw new IOException("RuntimeException loading dictionary", e);
113         }
114         final String end = raf.readUTF();
115         if (!end.equals(END_OF_DICTIONARY)) {
116             throw new IOException("Dictionary seems corrupt: " + end);
117         }
118     }
119
120     @Override
121     public void write(DataOutput out) throws IOException {
122         RandomAccessFile raf = (RandomAccessFile)out;
123         if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!");
124         raf.writeInt(dictFileVersion);
125         raf.writeLong(creationMillis);
126         raf.writeUTF(dictInfo);
127         System.out.println("sources start: " + raf.getFilePointer());
128         RAFList.write(raf, sources, new EntrySource.Serializer(this));
129         System.out.println("pair start: " + raf.getFilePointer());
130         RAFList.write(raf, pairEntries, new PairEntry.Serializer(this), 64, true);
131         System.out.println("text start: " + raf.getFilePointer());
132         RAFList.write(raf, textEntries, new TextEntry.Serializer(this));
133         System.out.println("html index start: " + raf.getFilePointer());
134         RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this, null), 64, true);
135         System.out.println("html data start: " + raf.getFilePointer());
136         assert htmlData == null;
137         RAFList.write(raf, htmlEntries, new HtmlEntry.DataSerializer(), 128, true);
138         System.out.println("indices start: " + raf.getFilePointer());
139         RAFList.write(raf, indices, new IndexSerializer(null));
140         System.out.println("end: " + raf.getFilePointer());
141         raf.writeUTF(END_OF_DICTIONARY);
142     }
143
144     private void writev6Sources(RandomAccessFile out) throws IOException {
145         out.writeInt(sources.size());
146         long tocPos = out.getFilePointer();
147         out.seek(tocPos + sources.size() * 8 + 8);
148         for (EntrySource s : sources) {
149             long dataPos = out.getFilePointer();
150             out.seek(tocPos);
151             out.writeLong(dataPos);
152             tocPos += 8;
153             out.seek(dataPos);
154             out.writeUTF(s.getName());
155             out.writeInt(s.getNumEntries());
156         }
157         long dataPos = out.getFilePointer();
158         out.seek(tocPos);
159         out.writeLong(dataPos);
160         out.seek(dataPos);
161     }
162
163     private void writev6PairEntries(RandomAccessFile out) throws IOException {
164         out.writeInt(pairEntries.size());
165         long tocPos = out.getFilePointer();
166         out.seek(tocPos + pairEntries.size() * 8 + 8);
167         for (PairEntry pe : pairEntries) {
168             long dataPos = out.getFilePointer();
169             out.seek(tocPos);
170             out.writeLong(dataPos);
171             tocPos += 8;
172             out.seek(dataPos);
173             out.writeShort(pe.entrySource.index());
174             out.writeInt(pe.pairs.size());
175             for (PairEntry.Pair p : pe.pairs) {
176                 out.writeUTF(p.lang1);
177                 out.writeUTF(p.lang2);
178             }
179         }
180         long dataPos = out.getFilePointer();
181         out.seek(tocPos);
182         out.writeLong(dataPos);
183         out.seek(dataPos);
184     }
185
186     private void writev6TextEntries(RandomAccessFile out) throws IOException {
187         out.writeInt(textEntries.size());
188         long tocPos = out.getFilePointer();
189         out.seek(tocPos + textEntries.size() * 8 + 8);
190         for (TextEntry t : textEntries) {
191             long dataPos = out.getFilePointer();
192             out.seek(tocPos);
193             out.writeLong(dataPos);
194             tocPos += 8;
195             out.seek(dataPos);
196             out.writeShort(t.entrySource.index());
197             out.writeUTF(t.text);
198         }
199         long dataPos = out.getFilePointer();
200         out.seek(tocPos);
201         out.writeLong(dataPos);
202         out.seek(dataPos);
203     }
204
205     private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
206         out.writeInt(htmlEntries.size());
207         long tocPos = out.getFilePointer();
208         out.seek(tocPos + htmlEntries.size() * 8 + 8);
209         for (HtmlEntry h : htmlEntries) {
210             long dataPos = out.getFilePointer();
211             out.seek(tocPos);
212             out.writeLong(dataPos);
213             tocPos += 8;
214             out.seek(dataPos);
215             out.writeShort(h.entrySource.index());
216             out.writeUTF(h.title);
217             byte[] data = h.getHtml().getBytes("UTF-8");
218             out.writeInt(data.length);
219             ByteArrayOutputStream baos = new ByteArrayOutputStream();
220             GZIPOutputStream gzout = new GZIPOutputStream(baos);
221             gzout.write(data);
222             gzout.close();
223             out.writeInt(baos.size());
224             out.write(baos.toByteArray());
225         }
226         long dataPos = out.getFilePointer();
227         out.seek(tocPos);
228         out.writeLong(dataPos);
229         out.seek(dataPos);
230     }
231
232     private void writev6HtmlIndices(RandomAccessFile out, List<HtmlEntry> entries) throws IOException {
233         out.writeInt(entries.size());
234         long tocPos = out.getFilePointer();
235         out.seek(tocPos + entries.size() * 8 + 8);
236         for (HtmlEntry e : entries) {
237             long dataPos = out.getFilePointer();
238             out.seek(tocPos);
239             out.writeLong(dataPos);
240             tocPos += 8;
241             out.seek(dataPos);
242             out.writeInt(e.index());
243         }
244         long dataPos = out.getFilePointer();
245         out.seek(tocPos);
246         out.writeLong(dataPos);
247         out.seek(dataPos);
248     }
249
250     private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries) throws IOException {
251         out.writeInt(entries.size());
252         long tocPos = out.getFilePointer();
253         out.seek(tocPos + entries.size() * 8 + 8);
254         for (Index.IndexEntry e : entries) {
255             long dataPos = out.getFilePointer();
256             out.seek(tocPos);
257             out.writeLong(dataPos);
258             tocPos += 8;
259             out.seek(dataPos);
260             out.writeUTF(e.token);
261             out.writeInt(e.startRow);
262             out.writeInt(e.numRows);
263             final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
264             out.writeBoolean(hasNormalizedForm);
265             if (hasNormalizedForm) out.writeUTF(e.normalizedToken());
266             writev6HtmlIndices(out, e.htmlEntries);
267         }
268         long dataPos = out.getFilePointer();
269         out.seek(tocPos);
270         out.writeLong(dataPos);
271         out.seek(dataPos);
272     }
273
274     private void writev6Index(RandomAccessFile out) throws IOException {
275         out.writeInt(indices.size());
276         long tocPos = out.getFilePointer();
277         out.seek(tocPos + indices.size() * 8 + 8);
278         for (Index idx : indices) {
279             long dataPos = out.getFilePointer();
280             out.seek(tocPos);
281             out.writeLong(dataPos);
282             tocPos += 8;
283             out.seek(dataPos);
284             out.writeUTF(idx.shortName);
285             out.writeUTF(idx.longName);
286             out.writeUTF(idx.sortLanguage.getIsoCode());
287             out.writeUTF(idx.normalizerRules);
288             out.writeBoolean(idx.swapPairEntries);
289             out.writeInt(idx.mainTokenCount);
290             writev6IndexEntries(out, idx.sortedIndexEntries);
291
292             // write stoplist, serializing the whole Set *shudder*
293             final ByteArrayOutputStream baos = new ByteArrayOutputStream();
294             final ObjectOutputStream oos = new ObjectOutputStream(baos);
295             oos.writeObject(idx.stoplist);
296             oos.close();
297             final byte[] bytes = baos.toByteArray();
298             out.writeInt(bytes.length);
299             out.write(bytes);
300
301             out.writeInt(idx.rows.size());
302             out.writeInt(5);
303             for (RowBase r : idx.rows) {
304                 int type = 0;
305                 if (r instanceof PairEntry.Row) {
306                     type = 0;
307                 } else if (r instanceof TokenRow) {
308                     final TokenRow tokenRow = (TokenRow)r;
309                     type = tokenRow.hasMainEntry ? 1 : 3;
310                 } else if (r instanceof TextEntry.Row) {
311                     type = 2;
312                 } else if (r instanceof HtmlEntry.Row) {
313                     type = 4;
314                 } else {
315                     throw new RuntimeException("Row type not supported for v6");
316                 }
317                 out.writeByte(type);
318                 out.writeInt(r.referenceIndex);
319             }
320         }
321         long dataPos = out.getFilePointer();
322         out.seek(tocPos);
323         out.writeLong(dataPos);
324         out.seek(dataPos);
325     }
326
327     public void writev6(DataOutput out) throws IOException {
328         RandomAccessFile raf = (RandomAccessFile)out;
329         raf.writeInt(6);
330         raf.writeLong(creationMillis);
331         raf.writeUTF(dictInfo);
332         System.out.println("sources start: " + raf.getFilePointer());
333         writev6Sources(raf);
334         System.out.println("pair start: " + raf.getFilePointer());
335         writev6PairEntries(raf);
336         System.out.println("text start: " + raf.getFilePointer());
337         writev6TextEntries(raf);
338         System.out.println("html index start: " + raf.getFilePointer());
339         writev6HtmlEntries(raf);
340         System.out.println("indices start: " + raf.getFilePointer());
341         writev6Index(raf);
342         System.out.println("end: " + raf.getFilePointer());
343         raf.writeUTF(END_OF_DICTIONARY);
344     }
345
346     private final class IndexSerializer implements RAFListSerializer<Index> {
347         private final FileChannel ch;
348
349         IndexSerializer(FileChannel ch) {
350             this.ch = ch;
351         }
352
353         @Override
354         public Index read(DataInput raf, final int readIndex) throws IOException {
355             return new Index(Dictionary.this, ch, raf);
356         }
357
358         @Override
359         public void write(DataOutput raf, Index t) throws IOException {
360             t.write(raf);
361         }
362     }
363
364     final RAFListSerializer<HtmlEntry> htmlEntryIndexSerializer = new RAFListSerializer<HtmlEntry>() {
365         @Override
366         public void write(DataOutput raf, HtmlEntry t) {
367             assert false;
368         }
369
370         @Override
371         public HtmlEntry read(DataInput raf, int readIndex) throws IOException {
372             return htmlEntries.get(raf.readInt());
373         }
374     };
375
376     public void print(final PrintStream out) {
377         out.println("dictInfo=" + dictInfo);
378         for (final EntrySource entrySource : sources) {
379             out.printf("EntrySource: %s %d\n", entrySource.name, entrySource.numEntries);
380         }
381         out.println();
382         for (final Index index : indices) {
383             out.printf("Index: %s %s\n", index.shortName, index.longName);
384             index.print(out);
385             out.println();
386         }
387     }
388
389     public DictionaryInfo getDictionaryInfo() {
390         final DictionaryInfo result = new DictionaryInfo();
391         result.creationMillis = this.creationMillis;
392         result.dictInfo = this.dictInfo;
393         for (final Index index : indices) {
394             result.indexInfos.add(index.getIndexInfo());
395         }
396         return result;
397     }
398
399     public static DictionaryInfo getDictionaryInfo(final File file) {
400         RandomAccessFile raf = null;
401         try {
402             raf = new RandomAccessFile(file, "r");
403             final Dictionary dict = new Dictionary(raf.getChannel());
404             final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo();
405             dictionaryInfo.uncompressedFilename = file.getName();
406             dictionaryInfo.uncompressedBytes = file.length();
407             raf.close();
408             return dictionaryInfo;
409         } catch (IOException e) {
410             final DictionaryInfo dictionaryInfo = new DictionaryInfo();
411             dictionaryInfo.uncompressedFilename = file.getName();
412             dictionaryInfo.uncompressedBytes = file.length();
413             return dictionaryInfo;
414         } finally {
415             if (raf != null) {
416                 try {
417                     raf.close();
418                 } catch (IOException e) {
419                     e.printStackTrace();
420                 }
421             }
422         }
423     }
424
425 }