]> gitweb.fperrin.net Git - Dictionary.git/commitdiff
Add untested support for writing v6 files.
author Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Sat, 11 Apr 2020 21:13:56 +0000 (23:13 +0200)
committer Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Sat, 11 Apr 2020 21:14:50 +0000 (23:14 +0200)
Should allow converting v7 to v6 files,
with v6 being suitable for e.g. Tolino devices.
Also add a format specification for v6.

dictionary-format-v6.txt [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/Dictionary.java
src/com/hughes/android/dictionary/engine/Index.java
src/com/hughes/android/dictionary/engine/TextEntry.java

diff --git a/dictionary-format-v6.txt b/dictionary-format-v6.txt
new file mode 100644 (file)
index 0000000..17cbe23
--- /dev/null
@@ -0,0 +1,158 @@
+This is a quick write-up of the old dictionary file format, v6.
+v6 is troublesome as it relies on Java serialization and thus
+there will be references to Java types.
+This hasn't been checked for correctness and likely has some bugs.
+Also, I really should have used some standard format for writing this...
+
+===========================================
+
+Some basic types:
+
+[Short]
+  2 bytes: big-endian, signed value (note: negative values generally not used here)
+
+[Int]
+  4 bytes: big-endian, signed value (note: negative values generally not used here)
+
+[Long]
+  8 bytes: big-endian, signed value (note: negative values generally not used here)
+
+
+[String]
+  [Short]: string length
+  n bytes: string, modified UTF-8, n is value from previous element
+           note: no zero termination
+
+======================================================
+
+[Dictionary]
+
+[Int]: version, fixed value 6
+[Long]: file creation time (in milliseconds since Jan. 1st 1970)
+[String]: dictionary information (human-readable)
+
+list_of([source])
+list_of([pair_entry])
+list_of([text_entry])
+list_of([html_entry]) (since v5)
+list_of([index])
+
+[String]: string "END OF DICTIONARY" (length value 17)
+
+===========================
+
+All list_of entries describe a list of elements.
+These elements can have variable size, thus an index (table-of-contents, TOC)
+is needed.
+To reduce the cost of this table and enable more efficient compression,
+multiple entries can be stored in a block that gets one single index entry.
+I.e. it is only possible to do random-access to the start of a block,
+seeking to elements further inside the block must be done via reading.
+Caching should be used to reduce the performance impact of this (so
+that when entries 5, 4, 3 etc. of a block are read sequentially,
+parsing and decompression is done only once).
+
+These lists have the following base format:
+
+[Int]: number of entries in the list (must be >= 0) (<size>)
+
+<toc size>=<size>*8 + 8 bytes:
+  table-of-contents.
+  [Long] offset value for each block of entries.
+  Followed by a final [Long] offset value to the end of the list data (<end offset>).
+  Each offset is an absolute file position.
+
+<end offset>-<toc size>-<start of toc> bytes:
+  entry data
+
+==========================================================
+
+[source]
+
+[String]: name of source, e.g. "enwiktionary"
+[Int]: number of entries from that source (since v3) (I kind of wouldn't rely on that one
+being useful/correct...)
+
+========================================================
+
+[pair_entry]
+
+[Short]: source index (see list_of([source])) (since v1)
+[Int]: number of pairs in this entry (<num_pairs>)
+<num_pairs> times:
+  [String]: in first language
+  [String]: in second language (possibly empty)
+
+=================================================
+
+[text_entry]
+
+[Short]: source index (see list_of([source])) (since v1)
+[String]: text
+
+===========================================
+
+[html_entry]
+
+[Short]: source index (see list_of([source])) (since v1)
+[String]: title for HTML entry
+[Int]: length of decompressed data in bytes (<declen>)
+[Int]: length of compressed data in bytes (<len>)
+<len> bytes: HTML page data, UTF-8 encoded, gzip compressed
+
+=====================================
+
+[index]
+
+Note: this structure is used for binary search.
+It is thus critical that all entries are correctly
+sorted.
+The sorting is according to libicu; however, as the Java
+and Android versions do not match, special hacks
+have been added, like ignoring "-" for the comparison
+(unless that makes them equal, in which case they are
+compared including the dash).
+
+[String]: index short name
+[String]: index long name
+[String]: language ISO code (sort order depends on this)
+[String]: ICU normalizer rules to apply for sorting/searching
+1 byte: swap pair entries (if != 0, this index is for the second language entries in [pair_entry])
+[Int]: number of main tokens (?) (since v2)
+list_of([index_entry])
+[Int]: size of stop list set following (since v4)
+Set<String> stop list words (since v4)
+uniform_list_of([row])
+
+
+with uniform_list_of:
+[Int]: number of entries in list <num_entries>
+[Int]: size of entry <entry_size>
+<num_entries>*<entry_size> bytes: data
+
+
+================================================
+
+[index_entry]
+
+[String]: token
+[Int]: start index into uniform_list_of([row])
+[Int]: number of rows covered
+1 byte: <has_normalized>
+if <has_normalized> != 0:
+  [String]: normalized token
+list_of([Int]): list of indices into list_of([html_entry]) (since v6)
+
+=======================================
+
+[row]
+
+1 byte: <type>
+[Int]: index
+
+<type> means:
+0: index into list_of([pair_entry])
+1: index into list_of([index_entry]) (mark as "main word header" entry)
+2: index into list_of([text_entry])
+3: index into list_of([index_entry]) (mark as "extra info/translation" entry)
+4: index into list_of([html_entry])
index c378f8d6edee7e34ae6eff490f5a756c7e2dcdee..82ba2cfa6d90c319c1cbcb8d01e6950dbab19402 100644 (file)
@@ -20,11 +20,13 @@ import com.hughes.util.raf.RAFList;
 import com.hughes.util.raf.RAFListSerializer;
 import com.hughes.util.raf.RAFSerializable;
 
+import java.io.ByteArrayOutputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
 import java.io.DataOutput;
 import java.io.File;
 import java.io.IOException;
+import java.io.ObjectOutputStream;
 import java.io.PrintStream;
 import java.io.RandomAccessFile;
 import java.nio.channels.Channels;
@@ -32,6 +34,7 @@ import java.nio.channels.FileChannel;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.zip.GZIPOutputStream;
 
 public class Dictionary implements RAFSerializable<Dictionary> {
 
@@ -117,6 +120,7 @@ public class Dictionary implements RAFSerializable<Dictionary> {
     @Override
     public void write(DataOutput out) throws IOException {
         RandomAccessFile raf = (RandomAccessFile)out;
+        if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!");
         raf.writeInt(dictFileVersion);
         raf.writeLong(creationMillis);
         raf.writeUTF(dictInfo);
@@ -137,6 +141,208 @@ public class Dictionary implements RAFSerializable<Dictionary> {
         raf.writeUTF(END_OF_DICTIONARY);
     }
 
+    /**
+     * Writes the list of entry sources in the v6 list layout: the entry count,
+     * a table of contents (TOC) with one absolute file offset per entry plus a
+     * final end-of-data offset, and then the entry data (source name followed
+     * by its entry count).
+     * On return the file pointer is positioned directly after the list data.
+     *
+     * @param out file to write to, positioned where the list should start
+     * @throws IOException if writing to or seeking in the file fails
+     */
+    private void writev6Sources(RandomAccessFile out) throws IOException {
+        out.writeInt(sources.size());
+        long tocPos = out.getFilePointer();
+        // Skip over the TOC; its slots are filled in while the data is written.
+        // 8L forces long arithmetic so the TOC size cannot overflow an int.
+        out.seek(tocPos + 8L * sources.size() + 8);
+        for (EntrySource s : sources) {
+            // Record where this entry starts, then jump back to the data area.
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeUTF(s.getName());
+            out.writeInt(s.getNumEntries());
+        }
+        // Final TOC slot: offset of the end of the list data.
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    /**
+     * Writes the list of pair entries in the v6 list layout: the entry count,
+     * a table of contents (TOC) with one absolute file offset per entry plus a
+     * final end-of-data offset, and then the entry data. Each entry consists of
+     * its source index, the number of pairs and the two strings of every pair.
+     * On return the file pointer is positioned directly after the list data.
+     *
+     * @param out file to write to, positioned where the list should start
+     * @throws IOException if writing to or seeking in the file fails
+     */
+    private void writev6PairEntries(RandomAccessFile out) throws IOException {
+        out.writeInt(pairEntries.size());
+        long tocPos = out.getFilePointer();
+        // Skip over the TOC; its slots are filled in while the data is written.
+        // 8L forces long arithmetic so the TOC size cannot overflow an int.
+        out.seek(tocPos + 8L * pairEntries.size() + 8);
+        for (PairEntry pe : pairEntries) {
+            // Record where this entry starts, then jump back to the data area.
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeShort(pe.entrySource.index());
+            out.writeInt(pe.pairs.size());
+            for (PairEntry.Pair p : pe.pairs) {
+                out.writeUTF(p.lang1);
+                out.writeUTF(p.lang2);
+            }
+        }
+        // Final TOC slot: offset of the end of the list data.
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    /**
+     * Writes the list of text entries in the v6 list layout: the entry count,
+     * a table of contents (TOC) with one absolute file offset per entry plus a
+     * final end-of-data offset, and then the entry data (source index followed
+     * by the entry text).
+     * On return the file pointer is positioned directly after the list data.
+     *
+     * @param out file to write to, positioned where the list should start
+     * @throws IOException if writing to or seeking in the file fails
+     */
+    private void writev6TextEntries(RandomAccessFile out) throws IOException {
+        out.writeInt(textEntries.size());
+        long tocPos = out.getFilePointer();
+        // Skip over the TOC; its slots are filled in while the data is written.
+        // 8L forces long arithmetic so the TOC size cannot overflow an int.
+        out.seek(tocPos + 8L * textEntries.size() + 8);
+        for (TextEntry t : textEntries) {
+            // Record where this entry starts, then jump back to the data area.
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeShort(t.entrySource.index());
+            out.writeUTF(t.text);
+        }
+        // Final TOC slot: offset of the end of the list data.
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    /**
+     * Writes the list of HTML entries in the v6 list layout: the entry count,
+     * a table of contents (TOC) with one absolute file offset per entry plus a
+     * final end-of-data offset, and then the entry data. Each entry consists of
+     * its source index, its title, the length of the UTF-8 encoded HTML, the
+     * length of the gzip-compressed HTML and the compressed bytes themselves.
+     * On return the file pointer is positioned directly after the list data.
+     *
+     * @param out file to write to, positioned where the list should start
+     * @throws IOException if writing to or seeking in the file fails
+     */
+    private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
+        out.writeInt(htmlEntries.size());
+        long tocPos = out.getFilePointer();
+        // Skip over the TOC; its slots are filled in while the data is written.
+        // 8L forces long arithmetic so the TOC size cannot overflow an int.
+        out.seek(tocPos + 8L * htmlEntries.size() + 8);
+        for (HtmlEntry h : htmlEntries) {
+            // Record where this entry starts, then jump back to the data area.
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeShort(h.entrySource.index());
+            out.writeUTF(h.title);
+            byte[] data = h.getHtml().getBytes("UTF-8");
+            // Uncompressed length comes first, then compressed length and data.
+            out.writeInt(data.length);
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            GZIPOutputStream gzout = new GZIPOutputStream(baos);
+            gzout.write(data);
+            // close() finishes the gzip stream so that baos holds the complete data.
+            gzout.close();
+            out.writeInt(baos.size());
+            out.write(baos.toByteArray());
+        }
+        // Final TOC slot: offset of the end of the list data.
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    /**
+     * Writes the indices of the given HTML entries in the v6 list layout: the
+     * entry count, a table of contents (TOC) with one absolute file offset per
+     * entry plus a final end-of-data offset, and then one int per entry holding
+     * the value of its {@code index()}.
+     * On return the file pointer is positioned directly after the list data.
+     *
+     * @param out file to write to, positioned where the list should start
+     * @param entries HTML entries whose indices are written
+     * @throws IOException if writing to or seeking in the file fails
+     */
+    private void writev6HtmlIndices(RandomAccessFile out, List<HtmlEntry> entries) throws IOException {
+        out.writeInt(entries.size());
+        long tocPos = out.getFilePointer();
+        // Skip over the TOC; its slots are filled in while the data is written.
+        // 8L forces long arithmetic so the TOC size cannot overflow an int.
+        out.seek(tocPos + 8L * entries.size() + 8);
+        for (HtmlEntry e : entries) {
+            // Record where this entry starts, then jump back to the data area.
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeInt(e.index());
+        }
+        // Final TOC slot: offset of the end of the list data.
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries) throws IOException {
+        out.writeInt(entries.size());
+        long tocPos = out.getFilePointer();
+        out.seek(tocPos + entries.size() * 8 + 8);
+        for (Index.IndexEntry e : entries) {
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeUTF(e.token);
+            out.writeInt(e.startRow);
+            out.writeInt(e.numRows);
+            final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
+            out.writeBoolean(hasNormalizedForm);
+            if (hasNormalizedForm) out.writeUTF(e.normalizedToken());
+            writev6HtmlIndices(out, e.htmlEntries);
+        }
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    private void writev6Index(RandomAccessFile out) throws IOException {
+        out.writeInt(indices.size());
+        long tocPos = out.getFilePointer();
+        out.seek(tocPos + indices.size() * 8 + 8);
+        for (Index idx : indices) {
+            long dataPos = out.getFilePointer();
+            out.seek(tocPos);
+            out.writeLong(dataPos);
+            tocPos += 8;
+            out.seek(dataPos);
+            out.writeUTF(idx.shortName);
+            out.writeUTF(idx.longName);
+            out.writeUTF(idx.sortLanguage.getIsoCode());
+            out.writeUTF(idx.normalizerRules);
+            out.writeBoolean(idx.swapPairEntries);
+            out.writeInt(idx.mainTokenCount);
+            writev6IndexEntries(out, idx.sortedIndexEntries);
+
+            // write stoplist, serializing the whole Set *shudder*
+            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            final ObjectOutputStream oos = new ObjectOutputStream(baos);
+            oos.writeObject(idx.stoplist);
+            oos.close();
+            final byte[] bytes = baos.toByteArray();
+            out.writeInt(bytes.length);
+            out.write(bytes);
+
+            out.writeInt(idx.rows.size());
+            out.writeInt(5);
+            for (RowBase r : idx.rows) {
+                int type = 0;
+                if (r instanceof PairEntry.Row) {
+                    type = 0;
+                } else if (r instanceof TokenRow) {
+                    final TokenRow tokenRow = (TokenRow)r;
+                    type = tokenRow.hasMainEntry ? 1 : 3;
+                } else if (r instanceof TextEntry.Row) {
+                    type = 2;
+                } else if (r instanceof HtmlEntry.Row) {
+                    type = 4;
+                } else {
+                    throw new RuntimeException("Row type not supported for v6");
+                }
+                out.writeByte(type);
+                out.writeInt(r.referenceIndex);
+            }
+        }
+        long dataPos = out.getFilePointer();
+        out.seek(tocPos);
+        out.writeLong(dataPos);
+        out.seek(dataPos);
+    }
+
+    public void writev6(DataOutput out) throws IOException {
+        RandomAccessFile raf = (RandomAccessFile)out;
+        raf.writeInt(6);
+        raf.writeLong(creationMillis);
+        raf.writeUTF(dictInfo);
+        System.out.println("sources start: " + raf.getFilePointer());
+        writev6Sources(raf);
+        System.out.println("pair start: " + raf.getFilePointer());
+        writev6PairEntries(raf);
+        System.out.println("text start: " + raf.getFilePointer());
+        writev6TextEntries(raf);
+        System.out.println("html index start: " + raf.getFilePointer());
+        writev6HtmlEntries(raf);
+        System.out.println("indices start: " + raf.getFilePointer());
+        writev6Index(raf);
+        System.out.println("end: " + raf.getFilePointer());
+        raf.writeUTF(END_OF_DICTIONARY);
+    }
+
     private final class IndexSerializer implements RAFListSerializer<Index> {
         private final FileChannel ch;
 
index 7304dcd2a06e1eead94fcb7fef9a86dcd9279dc4..e8d9789920a017cdc994296178a04653f869e534 100644 (file)
@@ -58,7 +58,7 @@ public final class Index implements RAFSerializable<Index> {
 
     // persisted: tells how the entries are sorted.
     public final Language sortLanguage;
-    private final String normalizerRules;
+    public final String normalizerRules;
 
     // Built from the two above.
     private Transliterator normalizer;
@@ -67,7 +67,7 @@ public final class Index implements RAFSerializable<Index> {
     public final List<IndexEntry> sortedIndexEntries;
 
     // persisted.
-    private final Set<String> stoplist;
+    public final Set<String> stoplist;
 
     // One big list!
     // Various sub-types.
index 22f0af2abcd7294dba9fe92cb495491a6b5126a8..26cdc5586b4fff70a8ca3cefd673e9f60e6c12ca 100644 (file)
@@ -27,7 +27,7 @@ import java.util.regex.Pattern;
 
 public class TextEntry extends AbstractEntry implements RAFSerializable<TextEntry> {
 
-    private final String text;
+    public final String text;
 
     private TextEntry(final Dictionary dictionary, final DataInput raf, final int index)
     throws IOException {