1 // Copyright 2020 Reimar Döffinger. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.engine;
17 import java.io.BufferedOutputStream;
18 import java.io.ByteArrayOutputStream;
19 import java.io.DataOutputStream;
20 import java.io.FileOutputStream;
21 import java.io.IOException;
22 import java.io.ObjectOutputStream;
23 import java.io.RandomAccessFile;
24 import java.nio.charset.StandardCharsets;
25 import java.util.Collections;
26 import java.util.List;
27 import java.util.zip.GZIPOutputStream;
// Writes the contents of a Dictionary through the writev6* methods declared
// in this class.
// NOTE(review): every line of this listing carries a leading line number and
// the numbering has gaps (e.g. 33-35), so closing braces and some statements
// are not visible in this copy.
29 public class DictionaryV6Writer {
// Dictionary whose sources, entries and indices the writev6* methods read.
30 private final Dictionary d;
// Takes the dictionary to be written. The constructor body (listing lines
// 33-34) is not visible; presumably it assigns `dictionary` to `d` - confirm.
32 public DictionaryV6Writer(Dictionary dictionary) {
// Writes the sources section to `out`: an int count, a table of contents
// (TOC) of file offsets, and one record per EntrySource in d.sources.
// Throws IOException if a write or seek on `out` fails.
36 private void writev6Sources(RandomAccessFile out) throws IOException {
// The TOC is collected in memory because each offset is only known at the
// moment the corresponding record is written.
37 ByteArrayOutputStream toc = new ByteArrayOutputStream();
38 DataOutputStream tocout = new DataOutputStream(toc);
40 out.writeInt(d.sources.size());
41 long tocPos = out.getFilePointer();
// Skip the space reserved for the TOC: one long per source plus one final long.
// NOTE(review): `size() * 8` is evaluated before it is added to the long
// tocPos; if size() returns int this overflows for very large lists - confirm.
42 out.seek(tocPos + d.sources.size() * 8 + 8);
43 for (EntrySource s : d.sources) {
// Record where this source's record starts, then write the record:
// name (writeUTF) followed by the entry count (int).
44 long dataPos = out.getFilePointer();
45 tocout.writeLong(dataPos);
47 out.writeUTF(s.getName());
48 out.writeInt(s.getNumEntries());
// Final TOC entry: the file position after the last record.
50 long dataPos = out.getFilePointer();
51 tocout.writeLong(dataPos);
// NOTE(review): listing lines 52-54 and 56-58 are not visible. Because the
// TOC space was skipped above, `out` presumably has to be repositioned to
// tocPos before this write and moved past the data afterwards - confirm.
55 out.write(toc.toByteArray());
// Writes the pair-entry section to `out`: a TOC (int count followed by file
// offsets) and one record per PairEntry in d.pairEntries.
// Throws IOException if writing fails.
59 private void writev6PairEntries(RandomAccessFile out) throws IOException {
60 ByteArrayOutputStream toc = new ByteArrayOutputStream();
61 DataOutputStream tocout = new DataOutputStream(toc);
63 long tocPos = out.getFilePointer();
// Offset of the first record: after the int count and size()+1 long TOC slots.
// NOTE(review): `size() * 8` is evaluated before widening to long if size()
// returns int - confirm this cannot overflow for the expected data sizes.
64 long dataPos = tocPos + 4 + d.pairEntries.size() * 8 + 8;
// Records are written through a buffered stream opened on the file
// descriptor of `out`.
// NOTE(review): listing lines 65-66 are not visible; presumably `out` is
// positioned at dataPos before this stream is used - confirm.
67 DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
// Here the count goes into the in-memory TOC instead of directly to `out`.
69 tocout.writeInt(d.pairEntries.size());
70 for (PairEntry pe : d.pairEntries) {
// outb.size() is the number of bytes written through outb so far, so this
// is the offset of the record about to be written relative to dataPos.
// NOTE(review): DataOutputStream.size() is an int that saturates at
// Integer.MAX_VALUE - verify sections cannot grow that large.
71 tocout.writeLong(dataPos + outb.size());
// Record layout: source index (short), pair count (int), then lang1 and
// lang2 of every pair via writeUTF.
73 outb.writeShort(pe.entrySource.index());
74 outb.writeInt(pe.pairs.size());
75 for (PairEntry.Pair p : pe.pairs) {
76 outb.writeUTF(p.lang1);
77 outb.writeUTF(p.lang2);
// Advance dataPos by everything written through outb; the result is stored
// as the final TOC entry.
80 dataPos += outb.size();
// NOTE(review): listing line 81 is not visible; outb is buffered, so a flush
// is presumably needed before `out` is used again - confirm.
82 tocout.writeLong(dataPos);
// NOTE(review): listing lines 83-85 and 87-88 are not visible; presumably
// `out` is moved to tocPos before this write and to dataPos after - confirm.
86 out.write(toc.toByteArray());
// Writes the text-entry section to `out`: an int count, a TOC of file
// offsets, and one record per TextEntry in d.textEntries.
// Throws IOException if a write or seek on `out` fails.
90 private void writev6TextEntries(RandomAccessFile out) throws IOException {
// TOC offsets are buffered in memory while the records are written.
91 ByteArrayOutputStream toc = new ByteArrayOutputStream();
92 DataOutputStream tocout = new DataOutputStream(toc);
94 out.writeInt(d.textEntries.size());
95 long tocPos = out.getFilePointer();
// Skip the space reserved for the TOC: one long per entry plus one final long.
// NOTE(review): `size() * 8` is evaluated before it is added to the long
// tocPos; if size() returns int this overflows for very large lists - confirm.
96 out.seek(tocPos + d.textEntries.size() * 8 + 8);
97 for (TextEntry t : d.textEntries) {
// Record the start offset, then write the record: source index (short)
// followed by the text (writeUTF).
// NOTE(review): writeUTF rejects strings whose encoded form exceeds 65535
// bytes - verify text entries stay below that.
98 long dataPos = out.getFilePointer();
99 tocout.writeLong(dataPos);
101 out.writeShort(t.entrySource.index());
102 out.writeUTF(t.text);
// Final TOC entry: the file position after the last record.
104 long dataPos = out.getFilePointer();
105 tocout.writeLong(dataPos);
// NOTE(review): listing lines 106-108 and 110-112 are not visible; presumably
// `out` is repositioned to tocPos before this write and back past the data
// afterwards - confirm.
109 out.write(toc.toByteArray());
// Writes the placeholder used by writev6 instead of the HTML entries when
// skipHtml is set.
// NOTE(review): listing line 114 is not visible; presumably it writes the
// zero element count that precedes the offset below - confirm.
113 private void writev6EmptyList(RandomAccessFile out) throws IOException {
// The long written here is the file position immediately after itself
// (current position plus the 8 bytes of this long).
115 out.writeLong(out.getFilePointer() + 8);
// Writes the HTML-entry section to `out`: a TOC (int count followed by file
// offsets) and one record per HtmlEntry in d.htmlEntries.
// Throws IOException if writing fails.
118 private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
119 ByteArrayOutputStream toc = new ByteArrayOutputStream();
120 DataOutputStream tocout = new DataOutputStream(toc);
122 long tocPos = out.getFilePointer();
// Offset of the first record: after the int count and size()+1 long TOC slots.
123 long dataPos = tocPos + 4 + d.htmlEntries.size() * 8 + 8;
// Records are written through a buffered stream opened on the file
// descriptor of `out`.
// NOTE(review): listing lines 124-125 are not visible; presumably `out` is
// positioned at dataPos before this stream is used - confirm.
126 DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
128 tocout.writeInt(d.htmlEntries.size());
129 for (HtmlEntry h : d.htmlEntries) {
// Offset of this record: dataPos plus the bytes written through outb so far.
130 tocout.writeLong(dataPos + outb.size());
// Record layout: source index (short), title (writeUTF), length of the
// UTF-8 encoded HTML (int), then the length and bytes of `baos`.
132 outb.writeShort(h.entrySource.index());
133 outb.writeUTF(h.title);
134 byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8);
135 outb.writeInt(data.length);
136 ByteArrayOutputStream baos = new ByteArrayOutputStream();
137 GZIPOutputStream gzout = new GZIPOutputStream(baos);
// NOTE(review): listing lines 138-139 are not visible; presumably `data` is
// written to gzout and gzout is finished/closed there, so that baos holds
// the complete gzip stream at this point - confirm.
140 outb.writeInt(baos.size());
141 outb.write(baos.toByteArray());
// Advance dataPos by everything written through outb; the result is stored
// as the final TOC entry.
143 dataPos += outb.size();
// NOTE(review): listing line 144 is not visible; outb is buffered, so a flush
// is presumably needed before `out` is used again - confirm.
145 tocout.writeLong(dataPos);
// NOTE(review): listing lines 146-148 and 150-152 are not visible; presumably
// `out` is moved to tocPos before this write and to dataPos after - confirm.
149 out.write(toc.toByteArray());
// Writes a list of HTML entry references to `out`: an int count, a TOC of
// entries.size()+1 longs, then index() of every entry as an int.
// `pos` is the file offset at which this list starts; the caller passes it in
// and it is only used to compute the TOC values.
// Throws IOException if writing fails.
153 private void writev6HtmlIndices(DataOutputStream out, long pos, List<HtmlEntry> entries) throws IOException {
// Offset of the first data int: after the int count and the TOC longs.
154 long dataPos = pos + 4 + entries.size() * 8 + 8;
156 out.writeInt(entries.size());
158 // TOC is trivial, so optimize writing it
159 for (int i = 0; i < entries.size(); i++) {
160 out.writeLong(dataPos);
// NOTE(review): listing lines 161-162 are not visible. Each element below is
// written as a 4-byte int, so dataPos is presumably advanced by 4 per
// iteration before the loop closes - confirm.
// Final TOC entry.
163 out.writeLong(dataPos);
165 for (HtmlEntry e : entries) {
166 out.writeInt(e.index());
// Writes the index-entry list of one index to `out`: a TOC (int count followed
// by file offsets) and one record per Index.IndexEntry in `entries`.
// `prunedRowIdx` is either null (rows are written unchanged) or maps an
// original row number to its number after pruning, with a negative value for
// a pruned row; when it is non-null no HTML references are written.
// Throws IOException if writing fails.
170 private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries, int[] prunedRowIdx) throws IOException {
171 ByteArrayOutputStream toc = new ByteArrayOutputStream();
172 DataOutputStream tocout = new DataOutputStream(toc);
174 long tocPos = out.getFilePointer();
// Offset of the first record: after the int count and size()+1 long TOC slots.
175 long dataPos = tocPos + 4 + entries.size() * 8 + 8;
// Records are written through a buffered stream opened on the file
// descriptor of `out`.
// NOTE(review): listing lines 176-177 are not visible; presumably `out` is
// positioned at dataPos before this stream is used - confirm.
178 DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
180 tocout.writeInt(entries.size());
181 for (Index.IndexEntry e : entries) {
// Offset of this record: dataPos plus the bytes written through outb so far.
182 tocout.writeLong(dataPos + outb.size());
184 outb.writeUTF(e.token);
186 int startRow = e.startRow;
187 int numRows = e.numRows;
188 if (prunedRowIdx != null) {
189 // note: the start row will always be a TokenRow
190 // and thus never be pruned
// Count the rows after the start row that survive pruning and translate the
// start row to its pruned number.
// NOTE(review): the declaration of newNumRows (listing line 191) is not
// visible; since the loop starts at i = 1 it presumably starts at 1 to
// account for the start row - confirm.
192 for (int i = 1; i < numRows; i++) {
193 if (prunedRowIdx[startRow + i] >= 0) newNumRows++;
195 startRow = prunedRowIdx[startRow];
196 numRows = newNumRows;
// Record layout after the token: start row (int), row count (int), a flag
// telling whether a normalized token follows, and that token if it differs
// from the plain token.
199 outb.writeInt(startRow);
200 outb.writeInt(numRows);
201 final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
202 outb.writeBoolean(hasNormalizedForm);
203 if (hasNormalizedForm) outb.writeUTF(e.normalizedToken());
// The HTML reference list is embedded in the record; its start offset is the
// current position of outb expressed as a file offset.
204 writev6HtmlIndices(outb, dataPos + outb.size(),
205 prunedRowIdx == null ? e.htmlEntries : Collections.emptyList());
// Advance dataPos by everything written through outb; the result is stored
// as the final TOC entry.
207 dataPos += outb.size();
// NOTE(review): listing line 208 is not visible; outb is buffered, so a flush
// is presumably needed before `out` is used again - confirm.
209 tocout.writeLong(dataPos);
// NOTE(review): listing lines 210-212 and 214-216 are not visible; presumably
// `out` is moved to tocPos before this write and to dataPos after - confirm.
213 out.write(toc.toByteArray());
// Writes the indices section to `out`: an int count, a TOC of file offsets,
// and for every Index in d.indices its header fields, its index entries, a
// serialized stoplist and its row list.
// `skipHtml` selects the pruned row numbering and drops HtmlEntry rows from
// the row list.
// Throws IOException if writing fails; throws RuntimeException for a row
// type this method does not handle.
217 private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException {
218 ByteArrayOutputStream toc = new ByteArrayOutputStream();
219 DataOutputStream tocout = new DataOutputStream(toc);
221 out.writeInt(d.indices.size());
222 long tocPos = out.getFilePointer();
// Skip the space reserved for the TOC: one long per index plus one final long.
223 out.seek(tocPos + d.indices.size() * 8 + 8);
224 for (Index idx : d.indices) {
225 // create pruned index for skipHtml feature
226 int[] prunedRowIdx = null;
// NOTE(review): listing lines 227-228 are not visible; presumably they
// declare prunedSize and guard the block below with skipHtml, because
// prunedRowIdx must stay null when HTML is kept - confirm.
229 prunedRowIdx = new int[idx.rows.size()];
230 for (int i = 0; i < idx.rows.size(); i++) {
231 final RowBase r = idx.rows.get(i);
232 // prune Html entries
// A pruned row maps to -1; every kept row gets the next consecutive number.
233 boolean pruned = r instanceof HtmlEntry.Row;
234 prunedRowIdx[i] = pruned ? -1 : prunedSize;
235 if (!pruned) prunedSize++;
// TOC entry for this index: the file position where its data starts.
239 long dataPos = out.getFilePointer();
240 tocout.writeLong(dataPos);
// Index header fields, in the order they are written.
242 out.writeUTF(idx.shortName);
243 out.writeUTF(idx.longName);
244 out.writeUTF(idx.sortLanguage.getIsoCode());
245 out.writeUTF(idx.normalizerRules);
246 out.writeBoolean(idx.swapPairEntries);
247 out.writeInt(idx.mainTokenCount);
248 writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx);
250 // write stoplist, serializing the whole Set *shudder*
251 // Actually just emulate ObjectOutputStream serialization
// NOTE(review): listing lines 255-256, 260 and 267 of this initializer are
// not visible, so the byte sequence shown here is incomplete.
252 final byte[] hashSetSerialized = {
253 (byte)0xac, (byte)0xed, // magic
254 0x00, 0x05, // version
257 // "java.util.HashSet"
258 0x00, 0x11, 0x6a, 0x61, 0x76, 0x61, 0x2e, 0x75, 0x74, 0x69,
259 0x6c, 0x2e, 0x48, 0x61, 0x73, 0x68, 0x53, 0x65, 0x74,
261 (byte)0xba, 0x44, (byte)0x85, (byte)0x95, (byte)0x96, (byte)0xb8, (byte)0xb7, 0x34,
262 0x03, // flags: serialized, custom serialization function
263 0x00, 0x00, // fields count
264 0x78, // blockdata end
265 0x70, // null (superclass)
266 0x77, 0x0c // blockdata short, 0xc bytes
// Total byte length of the serialized stoplist, written ahead of the data.
268 int stoplistlen = hashSetSerialized.length;
269 stoplistlen += 12; // block data: capacity (int), load factor (float), size (int)
270 for (String s : idx.stoplist) {
// 3 bytes of per-string overhead plus one byte per char of the string.
// NOTE(review): s.length() counts UTF-16 chars; if the strings are written
// with writeUTF (not visible here), non-ASCII words would make this length
// too small - confirm.
271 stoplistlen += 3 + s.length();
// NOTE(review): listing lines 272-274 are not visible; the trailing 0x78
// byte written below is presumably added to stoplistlen there - confirm.
275 DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD())));
276 outb.writeInt(stoplistlen);
277 outb.write(hashSetSerialized);
278 outb.writeInt(idx.stoplist.size()); // capacity
279 outb.writeFloat(0.75f); // load factor
280 outb.writeInt(idx.stoplist.size()); // size
281 for (String s : idx.stoplist) {
282 outb.writeByte(0x74); // String type
// NOTE(review): listing lines 283-284 are not visible; the string itself is
// presumably written there before the loop closes - confirm.
285 outb.writeByte(0x78); // blockdata end
// Row count: only the kept rows when HTML rows are skipped.
287 outb.writeInt(skipHtml ? prunedSize : idx.rows.size());
289 for (RowBase r : idx.rows) {
// Map the row class to the type byte written below.
// NOTE(review): the declaration of `type` and the codes assigned for
// PairEntry.Row, TextEntry.Row and HtmlEntry.Row (listing lines 290, 292,
// 297, 299) are not visible; only the TokenRow codes 1 and 3 are.
291 if (r instanceof PairEntry.Row) {
293 } else if (r instanceof TokenRow) {
294 final TokenRow tokenRow = (TokenRow)r;
295 type = tokenRow.hasMainEntry ? 1 : 3;
296 } else if (r instanceof TextEntry.Row) {
298 } else if (r instanceof HtmlEntry.Row) {
// HTML rows are left out of the row list entirely when skipHtml is set.
300 if (skipHtml) continue;
302 throw new RuntimeException("Row type not supported for v6");
// Each row is written as its type byte followed by its reference index.
304 outb.writeByte(type);
305 outb.writeInt(r.referenceIndex);
// NOTE(review): listing lines 306-308 are not visible; outb is buffered, so
// it presumably gets flushed before the file pointer of `out` is read below
// - confirm.
// Final TOC entry: the file position after the last index.
309 long dataPos = out.getFilePointer();
310 tocout.writeLong(dataPos);
// NOTE(review): listing lines 311-313 and 315-317 are not visible; presumably
// `out` is moved to tocPos before this write and to dataPos after - confirm.
314 out.write(toc.toByteArray());
// Public entry point: writes the dictionary `d` to `raf`, section by section,
// printing the file offset at which each section starts to standard output.
// `skipHtml` replaces the HTML entries with an empty list and is passed on to
// the index writer.
// Throws IOException if writing to `raf` fails.
318 public void writev6(RandomAccessFile raf, boolean skipHtml) throws IOException {
// NOTE(review): listing line 319 is not visible; something (presumably a
// format version marker) may be written before the creation time - confirm.
320 raf.writeLong(d.creationMillis);
321 raf.writeUTF(d.dictInfo);
322 System.out.println("sources start: " + raf.getFilePointer());
// NOTE(review): listing line 323 is not visible; the call that writes the
// sources section is presumably made there - confirm.
324 System.out.println("pair start: " + raf.getFilePointer());
325 writev6PairEntries(raf);
326 System.out.println("text start: " + raf.getFilePointer());
327 writev6TextEntries(raf);
// This offset is where the HTML entries (or their empty placeholder) begin.
328 System.out.println("html index start: " + raf.getFilePointer());
329 if (skipHtml) writev6EmptyList(raf);
330 else writev6HtmlEntries(raf);
331 System.out.println("indices start: " + raf.getFilePointer());
332 writev6Index(raf, skipHtml);
333 System.out.println("end: " + raf.getFilePointer());
// Trailing marker string written after the last section.
334 raf.writeUTF("END OF DICTIONARY");