]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryV6Writer.java
c6049f65f9336f5449b680eae7aa0d7aab1d4805
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryV6Writer.java
1 // Copyright 2020 Reimar Döffinger. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.BufferedOutputStream;
18 import java.io.ByteArrayOutputStream;
19 import java.io.DataOutputStream;
20 import java.io.FileOutputStream;
21 import java.io.IOException;
22 import java.io.ObjectOutputStream;
23 import java.io.RandomAccessFile;
24 import java.nio.charset.StandardCharsets;
25 import java.util.Collections;
26 import java.util.List;
27 import java.util.zip.GZIPOutputStream;
28
29 import com.hughes.android.dictionary.engine.Dictionary;
30
31 public class DictionaryV6Writer {
32     private final Dictionary d;
33
34     public DictionaryV6Writer(Dictionary dictionary) {
35         d = dictionary;
36     }
37
38     private void writev6Sources(RandomAccessFile out) throws IOException {
39         ByteArrayOutputStream toc = new ByteArrayOutputStream();
40         DataOutputStream tocout = new DataOutputStream(toc);
41
42         out.writeInt(d.sources.size());
43         long tocPos = out.getFilePointer();
44         out.seek(tocPos + d.sources.size() * 8 + 8);
45         for (EntrySource s : d.sources) {
46             long dataPos = out.getFilePointer();
47             tocout.writeLong(dataPos);
48
49             out.writeUTF(s.getName());
50             out.writeInt(s.getNumEntries());
51         }
52         long dataPos = out.getFilePointer();
53         tocout.writeLong(dataPos);
54         tocout.close();
55
56         out.seek(tocPos);
57         out.write(toc.toByteArray());
58         out.seek(dataPos);
59     }
60
61     private void writev6PairEntries(RandomAccessFile out) throws IOException {
62         ByteArrayOutputStream toc = new ByteArrayOutputStream();
63         DataOutputStream tocout = new DataOutputStream(toc);
64
65         long tocPos = out.getFilePointer();
66         long dataPos = tocPos + 4 + d.pairEntries.size() * 8 + 8;
67
68         out.seek(dataPos);
69         DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
70
71         tocout.writeInt(d.pairEntries.size());
72         for (PairEntry pe : d.pairEntries) {
73             tocout.writeLong(dataPos + outb.size());
74
75             outb.writeShort(pe.entrySource.index());
76             outb.writeInt(pe.pairs.size());
77             for (PairEntry.Pair p : pe.pairs) {
78                 outb.writeUTF(p.lang1);
79                 outb.writeUTF(p.lang2);
80             }
81         }
82         dataPos += outb.size();
83         outb.flush();
84         tocout.writeLong(dataPos);
85         tocout.close();
86
87         out.seek(tocPos);
88         out.write(toc.toByteArray());
89         out.seek(dataPos);
90     }
91
92     private void writev6TextEntries(RandomAccessFile out) throws IOException {
93         ByteArrayOutputStream toc = new ByteArrayOutputStream();
94         DataOutputStream tocout = new DataOutputStream(toc);
95
96         out.writeInt(d.textEntries.size());
97         long tocPos = out.getFilePointer();
98         out.seek(tocPos + d.textEntries.size() * 8 + 8);
99         for (TextEntry t : d.textEntries) {
100             long dataPos = out.getFilePointer();
101             tocout.writeLong(dataPos);
102
103             out.writeShort(t.entrySource.index());
104             out.writeUTF(t.text);
105         }
106         long dataPos = out.getFilePointer();
107         tocout.writeLong(dataPos);
108         tocout.close();
109
110         out.seek(tocPos);
111         out.write(toc.toByteArray());
112         out.seek(dataPos);
113     }
114
115     private void writev6EmptyList(RandomAccessFile out) throws IOException {
116         out.writeInt(0);
117         out.writeLong(out.getFilePointer() + 8);
118     }
119
120     private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
121         ByteArrayOutputStream toc = new ByteArrayOutputStream();
122         DataOutputStream tocout = new DataOutputStream(toc);
123
124         long tocPos = out.getFilePointer();
125         long dataPos = tocPos + 4 + d.htmlEntries.size() * 8 + 8;
126
127         out.seek(dataPos);
128         DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
129
130         tocout.writeInt(d.htmlEntries.size());
131         for (HtmlEntry h : d.htmlEntries) {
132             tocout.writeLong(dataPos + outb.size());
133
134             outb.writeShort(h.entrySource.index());
135             outb.writeUTF(h.title);
136             byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8);
137             outb.writeInt(data.length);
138             ByteArrayOutputStream baos = new ByteArrayOutputStream();
139             GZIPOutputStream gzout = new GZIPOutputStream(baos);
140             gzout.write(data);
141             gzout.close();
142             outb.writeInt(baos.size());
143             outb.write(baos.toByteArray());
144         }
145         dataPos += outb.size();
146         outb.flush();
147         tocout.writeLong(dataPos);
148         tocout.close();
149
150         out.seek(tocPos);
151         out.write(toc.toByteArray());
152         out.seek(dataPos);
153     }
154
155     private void writev6HtmlIndices(DataOutputStream out, long pos, List<HtmlEntry> entries) throws IOException {
156         long dataPos = pos + 4 + entries.size() * 8 + 8;
157
158         out.writeInt(entries.size());
159
160         // TOC is trivial, so optimize writing it
161         for (int i = 0; i < entries.size(); i++) {
162             out.writeLong(dataPos);
163             dataPos += 4;
164         }
165         out.writeLong(dataPos);
166
167         for (HtmlEntry e : entries) {
168             out.writeInt(e.index());
169         }
170     }
171
172     private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries, int[] prunedRowIdx) throws IOException {
173         ByteArrayOutputStream toc = new ByteArrayOutputStream();
174         DataOutputStream tocout = new DataOutputStream(toc);
175
176         long tocPos = out.getFilePointer();
177         long dataPos = tocPos + 4 + entries.size() * 8 + 8;
178
179         out.seek(dataPos);
180         DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
181
182         tocout.writeInt(entries.size());
183         for (Index.IndexEntry e : entries) {
184             tocout.writeLong(dataPos + outb.size());
185
186             outb.writeUTF(e.token);
187
188             int startRow = e.startRow;
189             int numRows = e.numRows;
190             if (prunedRowIdx != null) {
191                 // note: the start row will always be a TokenRow
192                 // and thus never be pruned
193                 int newNumRows = 1;
194                 for (int i = 1; i < numRows; i++) {
195                     if (prunedRowIdx[startRow + i] >= 0) newNumRows++;
196                 }
197                 startRow = prunedRowIdx[startRow];
198                 numRows = newNumRows;
199             }
200
201             outb.writeInt(startRow);
202             outb.writeInt(numRows);
203             final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
204             outb.writeBoolean(hasNormalizedForm);
205             if (hasNormalizedForm) outb.writeUTF(e.normalizedToken());
206             writev6HtmlIndices(outb, dataPos + outb.size(),
207                                prunedRowIdx == null ? e.htmlEntries : Collections.<HtmlEntry>emptyList());
208         }
209         dataPos += outb.size();
210         outb.flush();
211         tocout.writeLong(dataPos);
212         tocout.close();
213
214         out.seek(tocPos);
215         out.write(toc.toByteArray());
216         out.seek(dataPos);
217     }
218
219     private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException {
220         ByteArrayOutputStream toc = new ByteArrayOutputStream();
221         DataOutputStream tocout = new DataOutputStream(toc);
222
223         out.writeInt(d.indices.size());
224         long tocPos = out.getFilePointer();
225         out.seek(tocPos + d.indices.size() * 8 + 8);
226         for (Index idx : d.indices) {
227             // create pruned index for skipHtml feature
228             int[] prunedRowIdx = null;
229             int prunedSize = 0;
230             if (skipHtml) {
231                 prunedRowIdx = new int[idx.rows.size()];
232                 for (int i = 0; i < idx.rows.size(); i++) {
233                     final RowBase r = idx.rows.get(i);
234                     // prune Html entries
235                     boolean pruned = r instanceof HtmlEntry.Row;
236                     prunedRowIdx[i] = pruned ? -1 : prunedSize;
237                     if (!pruned) prunedSize++;
238                 }
239             }
240
241             long dataPos = out.getFilePointer();
242             tocout.writeLong(dataPos);
243
244             out.writeUTF(idx.shortName);
245             out.writeUTF(idx.longName);
246             out.writeUTF(idx.sortLanguage.getIsoCode());
247             out.writeUTF(idx.normalizerRules);
248             out.writeBoolean(idx.swapPairEntries);
249             out.writeInt(idx.mainTokenCount);
250             writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx);
251
252             // write stoplist, serializing the whole Set *shudder*
253             final ByteArrayOutputStream baos = new ByteArrayOutputStream();
254             final ObjectOutputStream oos = new ObjectOutputStream(baos);
255             oos.writeObject(idx.stoplist);
256             oos.close();
257             final byte[] bytes = baos.toByteArray();
258
259
260             DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
261             outb.writeInt(bytes.length);
262             outb.write(bytes);
263
264             outb.writeInt(skipHtml ? prunedSize : idx.rows.size());
265             outb.writeInt(5);
266             for (RowBase r : idx.rows) {
267                 int type = 0;
268                 if (r instanceof PairEntry.Row) {
269                     type = 0;
270                 } else if (r instanceof TokenRow) {
271                     final TokenRow tokenRow = (TokenRow)r;
272                     type = tokenRow.hasMainEntry ? 1 : 3;
273                 } else if (r instanceof TextEntry.Row) {
274                     type = 2;
275                 } else if (r instanceof HtmlEntry.Row) {
276                     type = 4;
277                     if (skipHtml) continue;
278                 } else {
279                     throw new RuntimeException("Row type not supported for v6");
280                 }
281                 outb.writeByte(type);
282                 outb.writeInt(r.referenceIndex);
283             }
284             outb.flush();
285         }
286         long dataPos = out.getFilePointer();
287         tocout.writeLong(dataPos);
288         tocout.close();
289
290         out.seek(tocPos);
291         out.write(toc.toByteArray());
292         out.seek(dataPos);
293     }
294
295     public void writev6(RandomAccessFile raf, boolean skipHtml) throws IOException {
296         raf.writeInt(6);
297         raf.writeLong(d.creationMillis);
298         raf.writeUTF(d.dictInfo);
299         System.out.println("sources start: " + raf.getFilePointer());
300         writev6Sources(raf);
301         System.out.println("pair start: " + raf.getFilePointer());
302         writev6PairEntries(raf);
303         System.out.println("text start: " + raf.getFilePointer());
304         writev6TextEntries(raf);
305         System.out.println("html index start: " + raf.getFilePointer());
306         if (skipHtml) writev6EmptyList(raf);
307         else writev6HtmlEntries(raf);
308         System.out.println("indices start: " + raf.getFilePointer());
309         writev6Index(raf, skipHtml);
310         System.out.println("end: " + raf.getFilePointer());
311         raf.writeUTF("END OF DICTIONARY");
312     }
313 }