From acfb5ff7f1ab0cafad4bc6a00d854ef829738ae3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Reimar=20D=C3=B6ffinger?= Date: Sat, 11 Apr 2020 23:13:56 +0200 Subject: [PATCH] Add untested support for writing v6 files. Should allow converting v7 to v6 files, with v6 being suitable for e.g. Tolino devices. Also add a format specification for v6. --- dictionary-format-v6.txt | 158 ++++++++++++++ .../android/dictionary/engine/Dictionary.java | 206 ++++++++++++++++++ .../android/dictionary/engine/Index.java | 4 +- .../android/dictionary/engine/TextEntry.java | 2 +- 4 files changed, 367 insertions(+), 3 deletions(-) create mode 100644 dictionary-format-v6.txt diff --git a/dictionary-format-v6.txt b/dictionary-format-v6.txt new file mode 100644 index 0000000..17cbe23 --- /dev/null +++ b/dictionary-format-v6.txt @@ -0,0 +1,158 @@ +This is a quick write-up of the old dictionary file format, v6. +v6 is troublesome as it relies on Java serialization and thus +there will be references to Java types. +This hasn't been checked for correctness and likely has some bugs. +Also, I really should have used some standard format for writing this... + +=========================================== + +Some basic types: + +[Short] + 2 bytes: big-endian, signed value (note: negative values generally not used here) + +[Int] + 4 bytes: big-endian, signed value (note: negative values generally not used here) + +[Long] + 8 bytes: big-endian, signed value (note: negative values generally not used here) + + +[String] + [Short]: string length + n bytes: string, modified UTF-8, n is value from previous element + note: no zero termination + +====================================================== + +[Dictionary] + +[Int]: version, fixed value 6 +[Long]: file creation time (in milliseconds since Jan. 
1st 1970) +[String]: dictionary information (human-readable) + +list_of([source]) +list_of([pair_entry]) +list_of([text_entry]) +list_of([html_entry]) (since v5) +list_of([index]) + +[String]: string "END OF DICTIONARY" (length value 17) + +=========================== + +All list_of entries describe a list of elements. +These elements can have variable size, thus an index (table-of-contents, TOC) +is needed. +To reduce the cost of this table and enable more efficient compression, +multiple entries can be stored in a block that gets one single index entry. +I.e. it is only possible to do random-access to the start of a block, +seeking to elements further inside the block must be done via reading. +Caching should be used to reduce the performance impact of this (so +that when entries 5, 4, 3 etc. of a block are read sequentially, +parsing and decompression are done only once). + +These lists have the following base format: + +[Int]: number of entries in the list (must be >= 0) (<N>) + +<toc_size>=<N>*8 + 8 bytes: + table-of-contents, starting at file position <toc_start>. + [Long] offset value for each block of entries. + Followed by a final [Long] offset value to the end of the list data (<end_offset>). + Each offset is an absolute file position. + +<end_offset>-<toc_start>-<toc_size> bytes: + entry data + +========================================================== + +[source] + +[String]: name of source, e.g. "enwiktionary" +[Int]: number of entries from that source (since v3) (I kind of wouldn't rely on that one +being useful/correct...)
+ +======================================================== + +[pair_entry] + +[Short]: source index (see list_of([source])) (since v1) +[Int]: number of pairs in this entry (<N>) +<N> times: + [String]: in first language + [String]: in second language (possibly empty) + +================================================= + +[text_entry] + +[Short]: source index (see list_of([source])) (since v1) +[String]: text + +=========================================== + +[html_entry] + +[Short]: source index (see list_of([source])) (since v1) +[String]: title for HTML entry +[Int]: length of decompressed data in bytes (<declen>) +[Int]: length of compressed data in bytes (<complen>) +<complen> bytes: HTML page data, UTF-8 encoded, gzip compressed + +===================================== + +[index] + +Note: this structure is used for binary search. +It is thus critical that all entries are correctly +sorted. +The sorting is according to libicu, however as Java +and Android versions do not match special hacks +have been added, like ignoring "-" for the comparison +(unless that makes them equal, then they are +compared including the dash). + +[String]: index short name +[String]: index long name +[String]: language ISO code (sort order depends on this) +[String]: ICU normalizer rules to apply for sorting/searching +1 byte: swap pair entries (if != 0, this index is for the second language entries in [pair_entry]) +[Int]: number of main tokens (?)
(since v2) +list_of([index_entry]) +[Int]: size of stop list set following, in bytes (since v4) +Java-serialized Set of stop list words (since v4) +uniform_list_of([row]) + + +with uniform_list_of: +[Int]: number of entries in list (<N>) +[Int]: size of entry (<S>) +<N>*<S> bytes: data + + +================================================ + +[index_entry] + +[String]: token +[Int]: start index into uniform_list_of([row]) +[Int]: number of rows covered +1 byte: <has_normalized> +if <has_normalized> != 0: + [String]: normalized token +list_of([Int]) list of indices into list_of([html_entry]) (since v6) + +======================================= + +[row] + +1 byte: <type> +[Int]: index + +<type> means (NOTE: the v6 writer in Dictionary.java emits these as 0-4, i.e. one less than listed here - verify against the reader): +1: index into list_of([pair_entry]) +2: index into list_of([index_entry]) (mark as "main word header" entry) +3: index into list_of([text_entry]) +4: index into list_of([index_entry]) (mark as "extra info/translation" entry) +5: index into list_of([html_entry]) diff --git a/src/com/hughes/android/dictionary/engine/Dictionary.java b/src/com/hughes/android/dictionary/engine/Dictionary.java index c378f8d..82ba2cf 100644 --- a/src/com/hughes/android/dictionary/engine/Dictionary.java +++ b/src/com/hughes/android/dictionary/engine/Dictionary.java @@ -20,11 +20,13 @@ import com.hughes.util.raf.RAFList; import com.hughes.util.raf.RAFListSerializer; import com.hughes.util.raf.RAFSerializable; +import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.File; import java.io.IOException; +import java.io.ObjectOutputStream; import java.io.PrintStream; import java.io.RandomAccessFile; import java.nio.channels.Channels; @@ -32,6 +34,7 @@ import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.zip.GZIPOutputStream; public class Dictionary implements RAFSerializable { @@ -117,6 +120,7 @@ public class Dictionary implements RAFSerializable { @Override public void write(DataOutput out) throws IOException { RandomAccessFile
raf = (RandomAccessFile)out; + if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!"); raf.writeInt(dictFileVersion); raf.writeLong(creationMillis); raf.writeUTF(dictInfo); @@ -137,6 +141,208 @@ public class Dictionary implements RAFSerializable { raf.writeUTF(END_OF_DICTIONARY); } + private void writev6Sources(RandomAccessFile out) throws IOException { + out.writeInt(sources.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + sources.size() * 8 + 8); + for (EntrySource s : sources) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeUTF(s.getName()); + out.writeInt(s.getNumEntries()); + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + out.seek(dataPos); + } + + private void writev6PairEntries(RandomAccessFile out) throws IOException { + out.writeInt(pairEntries.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + pairEntries.size() * 8 + 8); + for (PairEntry pe : pairEntries) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeShort(pe.entrySource.index()); + out.writeInt(pe.pairs.size()); + for (PairEntry.Pair p : pe.pairs) { + out.writeUTF(p.lang1); + out.writeUTF(p.lang2); + } + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + out.seek(dataPos); + } + + private void writev6TextEntries(RandomAccessFile out) throws IOException { + out.writeInt(textEntries.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + textEntries.size() * 8 + 8); + for (TextEntry t : textEntries) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeShort(t.entrySource.index()); + out.writeUTF(t.text); + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + 
out.writeLong(dataPos); + out.seek(dataPos); + } + + private void writev6HtmlEntries(RandomAccessFile out) throws IOException { + out.writeInt(htmlEntries.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + htmlEntries.size() * 8 + 8); + for (HtmlEntry h : htmlEntries) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeShort(h.entrySource.index()); + out.writeUTF(h.title); + byte[] data = h.getHtml().getBytes("UTF-8"); + out.writeInt(data.length); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + GZIPOutputStream gzout = new GZIPOutputStream(baos); + gzout.write(data); + gzout.close(); + out.writeInt(baos.size()); + out.write(baos.toByteArray()); + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + out.seek(dataPos); + } + + private void writev6HtmlIndices(RandomAccessFile out, List entries) throws IOException { + out.writeInt(entries.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + entries.size() * 8 + 8); + for (HtmlEntry e : entries) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeInt(e.index()); + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + out.seek(dataPos); + } + + private void writev6IndexEntries(RandomAccessFile out, List entries) throws IOException { + out.writeInt(entries.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + entries.size() * 8 + 8); + for (Index.IndexEntry e : entries) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeUTF(e.token); + out.writeInt(e.startRow); + out.writeInt(e.numRows); + final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken()); + out.writeBoolean(hasNormalizedForm); + if (hasNormalizedForm) 
out.writeUTF(e.normalizedToken()); + writev6HtmlIndices(out, e.htmlEntries); + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + out.seek(dataPos); + } + + private void writev6Index(RandomAccessFile out) throws IOException { + out.writeInt(indices.size()); + long tocPos = out.getFilePointer(); + out.seek(tocPos + indices.size() * 8 + 8); + for (Index idx : indices) { + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + tocPos += 8; + out.seek(dataPos); + out.writeUTF(idx.shortName); + out.writeUTF(idx.longName); + out.writeUTF(idx.sortLanguage.getIsoCode()); + out.writeUTF(idx.normalizerRules); + out.writeBoolean(idx.swapPairEntries); + out.writeInt(idx.mainTokenCount); + writev6IndexEntries(out, idx.sortedIndexEntries); + + // write stoplist, serializing the whole Set *shudder* + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final ObjectOutputStream oos = new ObjectOutputStream(baos); + oos.writeObject(idx.stoplist); + oos.close(); + final byte[] bytes = baos.toByteArray(); + out.writeInt(bytes.length); + out.write(bytes); + + out.writeInt(idx.rows.size()); + out.writeInt(5); + for (RowBase r : idx.rows) { + int type = 0; + if (r instanceof PairEntry.Row) { + type = 0; + } else if (r instanceof TokenRow) { + final TokenRow tokenRow = (TokenRow)r; + type = tokenRow.hasMainEntry ? 
1 : 3; + } else if (r instanceof TextEntry.Row) { + type = 2; + } else if (r instanceof HtmlEntry.Row) { + type = 4; + } else { + throw new RuntimeException("Row type not supported for v6"); + } + out.writeByte(type); + out.writeInt(r.referenceIndex); + } + } + long dataPos = out.getFilePointer(); + out.seek(tocPos); + out.writeLong(dataPos); + out.seek(dataPos); + } + + public void writev6(DataOutput out) throws IOException { + RandomAccessFile raf = (RandomAccessFile)out; + raf.writeInt(6); + raf.writeLong(creationMillis); + raf.writeUTF(dictInfo); + System.out.println("sources start: " + raf.getFilePointer()); + writev6Sources(raf); + System.out.println("pair start: " + raf.getFilePointer()); + writev6PairEntries(raf); + System.out.println("text start: " + raf.getFilePointer()); + writev6TextEntries(raf); + System.out.println("html index start: " + raf.getFilePointer()); + writev6HtmlEntries(raf); + System.out.println("indices start: " + raf.getFilePointer()); + writev6Index(raf); + System.out.println("end: " + raf.getFilePointer()); + raf.writeUTF(END_OF_DICTIONARY); + } + private final class IndexSerializer implements RAFListSerializer { private final FileChannel ch; diff --git a/src/com/hughes/android/dictionary/engine/Index.java b/src/com/hughes/android/dictionary/engine/Index.java index 7304dcd..e8d9789 100644 --- a/src/com/hughes/android/dictionary/engine/Index.java +++ b/src/com/hughes/android/dictionary/engine/Index.java @@ -58,7 +58,7 @@ public final class Index implements RAFSerializable { // persisted: tells how the entries are sorted. public final Language sortLanguage; - private final String normalizerRules; + public final String normalizerRules; // Built from the two above. private Transliterator normalizer; @@ -67,7 +67,7 @@ public final class Index implements RAFSerializable { public final List sortedIndexEntries; // persisted. - private final Set stoplist; + public final Set stoplist; // One big list! // Various sub-types. 
diff --git a/src/com/hughes/android/dictionary/engine/TextEntry.java b/src/com/hughes/android/dictionary/engine/TextEntry.java index 22f0af2..26cdc55 100644 --- a/src/com/hughes/android/dictionary/engine/TextEntry.java +++ b/src/com/hughes/android/dictionary/engine/TextEntry.java @@ -27,7 +27,7 @@ import java.util.regex.Pattern; public class TextEntry extends AbstractEntry implements RAFSerializable { - private final String text; + public final String text; private TextEntry(final Dictionary dictionary, final DataInput raf, final int index) throws IOException { -- 2.43.0