gitweb.fperrin.net Git - Dictionary.git/commitdiff
Experiments with new dictionary format.
author Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Mon, 7 Dec 2015 15:35:35 +0000 (16:35 +0100)
committer Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Mon, 7 Dec 2015 21:33:57 +0000 (22:33 +0100)
res/raw/dictionary_info.txt
src/com/hughes/android/dictionary/engine/Dictionary.java
src/com/hughes/android/dictionary/engine/HtmlEntry.java
src/com/hughes/android/dictionary/engine/Index.java
src/com/hughes/android/dictionary/engine/PairEntry.java

index 281da0b29e9707dfef3b9c9c4846ec69650d81ed..00a226896cf8bb0f9a90741971f277c28c14ded7 100644 (file)
@@ -2,7 +2,7 @@ AR-DE.quickdic  http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-d
 DE-CA.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-CA.quickdic.v006.zip    1442056078905   4811184 2807340 2       DE      18490   17773   CA      15490   14102   (DE)Wiktionary-based DE-CA dictionary.
 DE-CS.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-CS.quickdic.v006.zip    1442056193569   8280756 5450496 2       DE      23504   22264   CS      24042   19644   (DE)Wiktionary-based DE-CS dictionary.
 DE-EO.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-EO.quickdic.v006.zip    1442056315613   4811254 2404134 2       DE      14741   14061   EO      17158   16319   (DE)Wiktionary-based DE-EO dictionary.
-DE-ES.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-ES.quickdic.v006.zip    1442056427916   7416740 3343255 2       DE      34399   32197   ES      29606   26911   (DE)Wiktionary-based DE-ES dictionary.
+DE-ES.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-ES.quickdic.v007.zip    1449523882659   4000266 3273614 2       DE      34399   32197   ES      29606   26911   (DE)Wiktionary-based DE-ES dictionary.
 DE-FR.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-FR.quickdic.v006.zip    1442056552895   20761086        15018822        2       DE      48386   45902   FR      32854   29253   (DE)Wiktionary-based DE-FR dictionary.
 DE-HE.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-HE.quickdic.v006.zip    1357252532185   1672312 577438  2       DE      8503    7935    HE      12590   5651    Wikitionary-based DE-HE dictionary.
 DE-HU.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-HU.quickdic.v006.zip    1442056707554   3771060 1667008 2       DE      19685   18595   HU      17566   15923   (DE)Wiktionary-based DE-HU dictionary.
index 2cc05b4f573213659b17636fbb2d2a52a91cd003..57ae6e73cd5f6c41ed392fa7d2f050eaa4b76251 100644 (file)
@@ -16,6 +16,7 @@ package com.hughes.android.dictionary.engine;
 
 import com.hughes.android.dictionary.DictionaryInfo;
 import com.hughes.util.CachingList;
+import com.hughes.util.StringUtil;
 import com.hughes.util.raf.RAFList;
 import com.hughes.util.raf.RAFListSerializer;
 import com.hughes.util.raf.RAFSerializable;
@@ -34,7 +35,7 @@ public class Dictionary implements RAFSerializable<Dictionary> {
 
     static final int CACHE_SIZE = 5000;
 
-    static final int CURRENT_DICT_VERSION = 6;
+    static final int CURRENT_DICT_VERSION = 7;
     static final String END_OF_DICTIONARY = "END OF DICTIONARY";
 
     // persisted
@@ -75,25 +76,25 @@ public class Dictionary implements RAFSerializable<Dictionary> {
         // disrupts the offset.
         try {
             final RAFList<EntrySource> rafSources = RAFList.create(raf, new EntrySource.Serializer(
-                    this), raf.getFilePointer());
+                    this), raf.getFilePointer(), dictFileVersion);
             sources = new ArrayList<EntrySource>(rafSources);
             raf.seek(rafSources.getEndOffset());
 
             pairEntries = CachingList.create(
-                    RAFList.create(raf, new PairEntry.Serializer(this), raf.getFilePointer()),
+                    RAFList.create(raf, new PairEntry.Serializer(this), raf.getFilePointer(), dictFileVersion, dictFileVersion >= 7 ? 64 : 1, dictFileVersion >= 7),
                     CACHE_SIZE);
             textEntries = CachingList.create(
-                    RAFList.create(raf, new TextEntry.Serializer(this), raf.getFilePointer()),
+                    RAFList.create(raf, new TextEntry.Serializer(this), raf.getFilePointer(), dictFileVersion),
                     CACHE_SIZE);
             if (dictFileVersion >= 5) {
                 htmlEntries = CachingList.create(
-                        RAFList.create(raf, new HtmlEntry.Serializer(this), raf.getFilePointer()),
+                        RAFList.create(raf, new HtmlEntry.Serializer(this), raf.getFilePointer(), dictFileVersion),
                         CACHE_SIZE);
             } else {
                 htmlEntries = Collections.emptyList();
             }
             indices = CachingList.createFullyCached(RAFList.create(raf, indexSerializer,
-                    raf.getFilePointer()));
+                    raf.getFilePointer(), dictFileVersion));
         } catch (RuntimeException e) {
             final IOException ioe = new IOException("RuntimeException loading dictionary");
             ioe.initCause(e);
@@ -111,11 +112,17 @@ public class Dictionary implements RAFSerializable<Dictionary> {
         raf.writeInt(dictFileVersion);
         raf.writeLong(creationMillis);
         raf.writeUTF(dictInfo);
+        System.out.println("sources start: " + raf.getFilePointer());
         RAFList.write(raf, sources, new EntrySource.Serializer(this));
-        RAFList.write(raf, pairEntries, new PairEntry.Serializer(this));
+        System.out.println("pair start: " + raf.getFilePointer());
+        RAFList.write(raf, pairEntries, new PairEntry.Serializer(this), 64, true);
+        System.out.println("text start: " + raf.getFilePointer());
         RAFList.write(raf, textEntries, new TextEntry.Serializer(this));
+        System.out.println("html start: " + raf.getFilePointer());
         RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this));
+        System.out.println("indices start: " + raf.getFilePointer());
         RAFList.write(raf, indices, indexSerializer);
+        System.out.println("end: " + raf.getFilePointer());
         raf.writeUTF(END_OF_DICTIONARY);
     }
 
index e37d19c0abc33bb7c0f95806c57fb86a9ad4dcd0..7694fe9488ec7fee49bc28dfa44aa96cf5cacc88 100644 (file)
@@ -33,7 +33,7 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable<HtmlEntr
             throws IOException {
         super(dictionary, raf, index);
         title = raf.readUTF();
-        lazyHtmlLoader = new LazyHtmlLoader(raf);
+        lazyHtmlLoader = new LazyHtmlLoader(raf, dictionary.dictFileVersion);
         html = null;
     }
 
@@ -44,8 +44,7 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable<HtmlEntr
 
         final byte[] bytes = getHtml().getBytes("UTF-8");
         final byte[] zipBytes = StringUtil.zipBytes(bytes);
-        raf.writeInt(bytes.length);
-        raf.writeInt(zipBytes.length);
+        StringUtil.writeVarInt(raf, zipBytes.length);
         raf.write(zipBytes);
     }
 
@@ -188,10 +187,15 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable<HtmlEntr
         // Not sure this volatile is right, but oh well.
         volatile SoftReference<String> htmlRef = new SoftReference<String>(null);
 
-        private LazyHtmlLoader(final DataInput inp) throws IOException {
+        private LazyHtmlLoader(final DataInput inp, int version) throws IOException {
             raf = (RandomAccessFile)inp;
-            numBytes = raf.readInt();
-            numZipBytes = raf.readInt();
+            if (version >= 7) {
+                numBytes = -1;
+                numZipBytes = StringUtil.readVarInt(raf);
+            } else {
+                numBytes = raf.readInt();
+                numZipBytes = raf.readInt();
+            }
             offset = raf.getFilePointer();
             raf.skipBytes(numZipBytes);
         }
@@ -203,7 +207,6 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable<HtmlEntr
             }
             System.out.println("Loading Html: numBytes=" + numBytes + ", numZipBytes="
                     + numZipBytes);
-            final byte[] bytes = new byte[numBytes];
             final byte[] zipBytes = new byte[numZipBytes];
             synchronized (raf) {
                 try {
@@ -214,7 +217,7 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable<HtmlEntr
                 }
             }
             try {
-                StringUtil.unzipFully(zipBytes, bytes);
+                final byte[] bytes = StringUtil.unzipFully(zipBytes, numBytes);
                 html = new String(bytes, "UTF-8");
             } catch (IOException e) {
                 throw new RuntimeException(e);
index ce6947768ee77a8fd345eedbb53fb65e6c7f1d96..cb29bac28ab989a40d6f12c7d647f77b39c974ab 100644 (file)
@@ -22,6 +22,7 @@ import com.hughes.android.dictionary.DictionaryInfo;
 import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
 import com.hughes.android.dictionary.engine.RowBase.RowKey;
 import com.hughes.util.CachingList;
+import com.hughes.util.StringUtil;
 import com.hughes.util.TransformingList;
 import com.hughes.util.raf.RAFList;
 import com.hughes.util.raf.RAFSerializable;
@@ -36,6 +37,7 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.RandomAccessFile;
+import java.util.AbstractList;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -133,7 +135,8 @@ public final class Index implements RAFSerializable<Index> {
             mainTokenCount = raf.readInt();
         }
         sortedIndexEntries = CachingList.create(
-                RAFList.create(raf, indexEntrySerializer, raf.getFilePointer()), CACHE_SIZE);
+                RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(), dict.dictFileVersion,
+                dict.dictFileVersion >= 7 ? 16 : 1, dict.dictFileVersion >= 7), CACHE_SIZE);
         if (dict.dictFileVersion >= 4) {
             stoplist = new SerializableSerializer<Set<String>>().read(raf);
         } else {
@@ -155,7 +158,7 @@ public final class Index implements RAFSerializable<Index> {
         if (dict.dictFileVersion >= 2) {
             raf.writeInt(mainTokenCount);
         }
-        RAFList.write(raf, sortedIndexEntries, indexEntrySerializer);
+        RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 16, true);
         new SerializableSerializer<Set<String>>().write(raf, stoplist);
         UniformRAFList.write(raf, rows, new RowBase.Serializer(this), 5 /*
                                                                                                * bytes
@@ -188,7 +191,8 @@ public final class Index implements RAFSerializable<Index> {
         private final String normalizedToken;
         public final int startRow;
         public final int numRows; // doesn't count the token row!
-        public final List<HtmlEntry> htmlEntries;
+        public List<HtmlEntry> htmlEntries;
+        private int[] htmlEntryIndices;
 
         public IndexEntry(final Index index, final String token, final String normalizedToken,
                 final int startRow, final int numRows) {
@@ -202,34 +206,56 @@ public final class Index implements RAFSerializable<Index> {
             this.htmlEntries = new ArrayList<HtmlEntry>();
         }
 
-        public IndexEntry(final Index index, final DataInput inp) throws IOException {
+        public IndexEntry(final Index index, final DataInput raf) throws IOException {
             this.index = index;
-            RandomAccessFile raf = (RandomAccessFile)inp;
             token = raf.readUTF();
-            startRow = raf.readInt();
-            numRows = raf.readInt();
+            if (index.dict.dictFileVersion >= 7) {
+                startRow = StringUtil.readVarInt(raf);
+                numRows = StringUtil.readVarInt(raf);
+            } else {
+                startRow = raf.readInt();
+                numRows = raf.readInt();
+            }
             final boolean hasNormalizedForm = raf.readBoolean();
             normalizedToken = hasNormalizedForm ? raf.readUTF() : token;
-            if (index.dict.dictFileVersion >= 6) {
+            htmlEntryIndices = null;
+            if (index.dict.dictFileVersion >= 7) {
+                int size = StringUtil.readVarInt(raf);
+                htmlEntryIndices = new int[size];
+                for (int i = 0; i < size; ++i) {
+                    htmlEntryIndices[i] = StringUtil.readVarInt(raf);
+                }
+                this.htmlEntries = CachingList.create(new AbstractList<HtmlEntry>() {
+                    @Override
+                    public HtmlEntry get(int i) {
+                        return index.dict.htmlEntries.get(htmlEntryIndices[i]);
+                    }
+                    @Override
+                    public int size() {
+                        return htmlEntryIndices.length;
+                    }
+                    }, 1);
+            } else if (index.dict.dictFileVersion >= 6) {
                 this.htmlEntries = CachingList.create(
-                        RAFList.create(raf, index.dict.htmlEntryIndexSerializer,
-                                raf.getFilePointer()), 1);
+                        RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer,
+                                ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion), 1);
             } else {
                 this.htmlEntries = Collections.emptyList();
             }
         }
 
-        public void write(DataOutput out) throws IOException {
-            RandomAccessFile raf = (RandomAccessFile)out;
+        public void write(DataOutput raf) throws IOException {
             raf.writeUTF(token);
-            raf.writeInt(startRow);
-            raf.writeInt(numRows);
+            StringUtil.writeVarInt(raf, startRow);
+            StringUtil.writeVarInt(raf, numRows);
             final boolean hasNormalizedForm = !token.equals(normalizedToken);
             raf.writeBoolean(hasNormalizedForm);
             if (hasNormalizedForm) {
                 raf.writeUTF(normalizedToken);
             }
-            RAFList.write(raf, htmlEntries, index.dict.htmlEntryIndexSerializer);
+            StringUtil.writeVarInt(raf, htmlEntries.size());
+            for (HtmlEntry e : htmlEntries)
+                StringUtil.writeVarInt(raf, e.index());
         }
 
         public String toString() {
index 00f690a3b0a7d4462b70844ed1048a4f51049434..ad9625caaf3ac7ec450ceecb788dee36bd54a9d5 100644 (file)
@@ -14,6 +14,7 @@
 
 package com.hughes.android.dictionary.engine;
 
+import com.hughes.util.StringUtil;
 import com.hughes.util.raf.RAFListSerializer;
 import com.hughes.util.raf.RAFSerializable;
 import com.ibm.icu.text.Transliterator;
@@ -44,7 +45,7 @@ public class PairEntry extends AbstractEntry implements RAFSerializable<PairEntr
     public PairEntry(final Dictionary dictionary, final DataInput raf, final int index)
             throws IOException {
         super(dictionary, raf, index);
-        final int size = raf.readInt();
+        final int size = StringUtil.readVarInt(raf);
         pairs = new ArrayList<PairEntry.Pair>(size);
         for (int i = 0; i < size; ++i) {
             pairs.add(new Pair(raf.readUTF(), raf.readUTF()));
@@ -54,8 +55,7 @@ public class PairEntry extends AbstractEntry implements RAFSerializable<PairEntr
     @Override
     public void write(DataOutput raf) throws IOException {
         super.write(raf);
-        // TODO: this could be a short.
-        raf.writeInt(pairs.size());
+        StringUtil.writeVarInt(raf, pairs.size());
         for (int i = 0; i < pairs.size(); ++i) {
             assert pairs.get(i).lang1.length() > 0;
             raf.writeUTF(pairs.get(i).lang1);