--- /dev/null
+This is a quick write-up of the old dictionary file format, v6.
+v6 is troublesome as it relies on Java serialization and thus
+there will be references to Java types.
+This hasn't been checked for correctness and likely has some bugs.
+Also, I really should have used some standard format for writing this...
+
+===========================================
+
+Some basic types:
+
+[Short]
+ 2 bytes: big-endian, signed value (note: negative values generally not used here)
+
+[Int]
+ 4 bytes: big-endian, signed value (note: negative values generally not used here)
+
+[Long]
+ 8 bytes: big-endian, signed value (note: negative values generally not used here)
+
+
+[String]
+ [Short]: string length
+ n bytes: string, modified UTF-8, n is value from previous element
+ note: no zero termination
+
+======================================================
+
+[Dictionary]
+
+[Int]: version, fixed value 6
+[Long]: file creation time (in milliseconds since Jan. 1st 1970)
+[String]: dictionary information (human-readable)
+
+list_of([source])
+list_of([pair_entry])
+list_of([text_entry])
+list_of([html_entry]) (since v5)
+list_of([index])
+
+[String]: string "END OF DICTIONARY" (length value 17)
+
+===========================
+
+All list_of entries describe a list of elements.
+These elements can have variable size, thus an index (table-of-contents, TOC)
+is needed.
+To reduce the cost of this table and enable more efficient compression,
+multiple entries can be stored in a block that gets one single index entry.
+I.e. it is only possible to do random-access to the start of a block;
+seeking to elements further inside the block must be done via reading.
+Caching should be used to reduce the performance impact of this (so
+that when entries 5, 4, 3 etc. of a block are read sequentially,
+parsing and decompression is done only once).
+
+These lists have the following base format:
+
+[Int]: number of entries in the list (must be >= 0) (<size>)
+
+<toc size>=<size>*8 + 8 bytes:
+ table-of-contents.
+ [Long] offset value for each block of entries.
+ Followed by a final [Long] offset value to the end of the list data (<end offset>).
+ Each offset is an absolute file position.
+
+<end offset>-<toc size>-<start of toc> bytes:
+ entry data
+
+==========================================================
+
+[source]
+
+[String]: name of source, e.g. "enwiktionary"
+[Int]: number of entries from that source (since v3) (I kind of wouldn't rely on that one
+being useful/correct...)
+
+========================================================
+
+[pair_entry]
+
+[Short]: source index (see list_of([source])) (since v1)
+[Int]: number of pairs in this entry (<num_pairs>)
+<num_pairs> times:
+ [String]: in first language
+ [String]: in second language (possibly empty)
+
+=================================================
+
+[text_entry]
+
+[Short]: source index (see list_of([source])) (since v1)
+[String]: text
+
+===========================================
+
+[html_entry]
+
+[Short]: source index (see list_of([source])) (since v1)
+[String]: title for HTML entry
+[Int]: length of decompressed data in bytes (<declen>)
+[Int]: length of compressed data in bytes (<len>)
+<len> bytes: HTML page data, UTF-8 encoded, gzip compressed
+
+=====================================
+
+[index]
+
+Note: this structure is used for binary search.
+It is thus critical that all entries are correctly
+sorted.
+The sorting is according to libicu, however as Java
+and Android versions do not match special hacks
+have been added, like ignoring "-" for the comparison
+(unless that makes them equal, then they are
+compared including the dash).
+
+[String]: index short name
+[String]: index long name
+[String]: language ISO code (sort order depends on this)
+[String]: ICU normalizer rules to apply for sorting/searching
+1 byte: swap pair entries (if != 0, this index is for the second language entries in [pair_entry])
+[Int]: number of main tokens (?) (since v2)
+list_of([index_entry])
+[Int]: length in bytes of the serialized stop list data that follows (since v4)
+n bytes: Java-serialized (ObjectOutputStream) Set<String> of stop list words, n is value from previous element (since v4)
+uniform_list_of([row])
+
+
+with uniform_list_of:
+[Int]: number of entries in list <num_entries>
+[Int]: size of entry <entry_size>
+<num_entries>*<entry_size> bytes: data
+
+
+================================================
+
+[index_entry]
+
+[String]: token
+[Int]: start index into uniform_list_of([row])
+[Int]: number of rows covered
+1 byte: <has_normalized>
+if <has_normalized> != 0:
+ [String]: normalized token
+list_of([Int]) list of indices into list_of(html_entry) (since v6)
+
+=======================================
+
+[row]
+
+1 byte: <type>
+[Int]: index
+
+<type> means (note: values start at 0, matching the writer code):
+0: index into list_of([pair_entry])
+1: index into list_of([index_entry]) (mark as "main word header" entry)
+2: index into list_of([text_entry])
+3: index into list_of([index_entry]) (mark as "extra info/translation" entry)
+4: index into list_of([html_entry])
import com.hughes.util.raf.RAFListSerializer;
import com.hughes.util.raf.RAFSerializable;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.RandomAccessFile;
import java.nio.channels.Channels;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.zip.GZIPOutputStream;
public class Dictionary implements RAFSerializable<Dictionary> {
    // NOTE(review): only the header (version, creation time, info string) and
    // the END_OF_DICTIONARY trailer are visible here -- the list sections
    // appear to be written elsewhere; confirm against the full file.
    @Override
    public void write(DataOutput out) throws IOException {
        // Cast assumes callers always pass a RandomAccessFile -- TODO confirm.
        RandomAccessFile raf = (RandomAccessFile)out;
        // Legacy v6 files use a different layout; see writev6 for that path.
        if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!");
        raf.writeInt(dictFileVersion);
        raf.writeLong(creationMillis);
        raf.writeUTF(dictInfo);
        raf.writeUTF(END_OF_DICTIONARY);
    }
+ private void writev6Sources(RandomAccessFile out) throws IOException {
+ out.writeInt(sources.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + sources.size() * 8 + 8);
+ for (EntrySource s : sources) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeUTF(s.getName());
+ out.writeInt(s.getNumEntries());
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ private void writev6PairEntries(RandomAccessFile out) throws IOException {
+ out.writeInt(pairEntries.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + pairEntries.size() * 8 + 8);
+ for (PairEntry pe : pairEntries) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeShort(pe.entrySource.index());
+ out.writeInt(pe.pairs.size());
+ for (PairEntry.Pair p : pe.pairs) {
+ out.writeUTF(p.lang1);
+ out.writeUTF(p.lang2);
+ }
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ private void writev6TextEntries(RandomAccessFile out) throws IOException {
+ out.writeInt(textEntries.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + textEntries.size() * 8 + 8);
+ for (TextEntry t : textEntries) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeShort(t.entrySource.index());
+ out.writeUTF(t.text);
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ private void writev6HtmlEntries(RandomAccessFile out) throws IOException {
+ out.writeInt(htmlEntries.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + htmlEntries.size() * 8 + 8);
+ for (HtmlEntry h : htmlEntries) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeShort(h.entrySource.index());
+ out.writeUTF(h.title);
+ byte[] data = h.getHtml().getBytes("UTF-8");
+ out.writeInt(data.length);
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ GZIPOutputStream gzout = new GZIPOutputStream(baos);
+ gzout.write(data);
+ gzout.close();
+ out.writeInt(baos.size());
+ out.write(baos.toByteArray());
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ private void writev6HtmlIndices(RandomAccessFile out, List<HtmlEntry> entries) throws IOException {
+ out.writeInt(entries.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + entries.size() * 8 + 8);
+ for (HtmlEntry e : entries) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeInt(e.index());
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ private void writev6IndexEntries(RandomAccessFile out, List<Index.IndexEntry> entries) throws IOException {
+ out.writeInt(entries.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + entries.size() * 8 + 8);
+ for (Index.IndexEntry e : entries) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeUTF(e.token);
+ out.writeInt(e.startRow);
+ out.writeInt(e.numRows);
+ final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken());
+ out.writeBoolean(hasNormalizedForm);
+ if (hasNormalizedForm) out.writeUTF(e.normalizedToken());
+ writev6HtmlIndices(out, e.htmlEntries);
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ private void writev6Index(RandomAccessFile out) throws IOException {
+ out.writeInt(indices.size());
+ long tocPos = out.getFilePointer();
+ out.seek(tocPos + indices.size() * 8 + 8);
+ for (Index idx : indices) {
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ tocPos += 8;
+ out.seek(dataPos);
+ out.writeUTF(idx.shortName);
+ out.writeUTF(idx.longName);
+ out.writeUTF(idx.sortLanguage.getIsoCode());
+ out.writeUTF(idx.normalizerRules);
+ out.writeBoolean(idx.swapPairEntries);
+ out.writeInt(idx.mainTokenCount);
+ writev6IndexEntries(out, idx.sortedIndexEntries);
+
+ // write stoplist, serializing the whole Set *shudder*
+ final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ final ObjectOutputStream oos = new ObjectOutputStream(baos);
+ oos.writeObject(idx.stoplist);
+ oos.close();
+ final byte[] bytes = baos.toByteArray();
+ out.writeInt(bytes.length);
+ out.write(bytes);
+
+ out.writeInt(idx.rows.size());
+ out.writeInt(5);
+ for (RowBase r : idx.rows) {
+ int type = 0;
+ if (r instanceof PairEntry.Row) {
+ type = 0;
+ } else if (r instanceof TokenRow) {
+ final TokenRow tokenRow = (TokenRow)r;
+ type = tokenRow.hasMainEntry ? 1 : 3;
+ } else if (r instanceof TextEntry.Row) {
+ type = 2;
+ } else if (r instanceof HtmlEntry.Row) {
+ type = 4;
+ } else {
+ throw new RuntimeException("Row type not supported for v6");
+ }
+ out.writeByte(type);
+ out.writeInt(r.referenceIndex);
+ }
+ }
+ long dataPos = out.getFilePointer();
+ out.seek(tocPos);
+ out.writeLong(dataPos);
+ out.seek(dataPos);
+ }
+
+ public void writev6(DataOutput out) throws IOException {
+ RandomAccessFile raf = (RandomAccessFile)out;
+ raf.writeInt(6);
+ raf.writeLong(creationMillis);
+ raf.writeUTF(dictInfo);
+ System.out.println("sources start: " + raf.getFilePointer());
+ writev6Sources(raf);
+ System.out.println("pair start: " + raf.getFilePointer());
+ writev6PairEntries(raf);
+ System.out.println("text start: " + raf.getFilePointer());
+ writev6TextEntries(raf);
+ System.out.println("html index start: " + raf.getFilePointer());
+ writev6HtmlEntries(raf);
+ System.out.println("indices start: " + raf.getFilePointer());
+ writev6Index(raf);
+ System.out.println("end: " + raf.getFilePointer());
+ raf.writeUTF(END_OF_DICTIONARY);
+ }
+
private final class IndexSerializer implements RAFListSerializer<Index> {
private final FileChannel ch;