From: Reimar Döffinger Date: Mon, 7 Dec 2015 15:35:35 +0000 (+0100) Subject: Experiments with new dictionary format. X-Git-Url: http://gitweb.fperrin.net/?p=Dictionary.git;a=commitdiff_plain;h=f8329ca5a93f93c26bc9f014a831da876f32867d Experiments with new dictionary format. --- diff --git a/res/raw/dictionary_info.txt b/res/raw/dictionary_info.txt index 281da0b..00a2268 100644 --- a/res/raw/dictionary_info.txt +++ b/res/raw/dictionary_info.txt @@ -2,7 +2,7 @@ AR-DE.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-d DE-CA.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-CA.quickdic.v006.zip 1442056078905 4811184 2807340 2 DE 18490 17773 CA 15490 14102 (DE)Wiktionary-based DE-CA dictionary. DE-CS.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-CS.quickdic.v006.zip 1442056193569 8280756 5450496 2 DE 23504 22264 CS 24042 19644 (DE)Wiktionary-based DE-CS dictionary. DE-EO.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-EO.quickdic.v006.zip 1442056315613 4811254 2404134 2 DE 14741 14061 EO 17158 16319 (DE)Wiktionary-based DE-EO dictionary. -DE-ES.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-ES.quickdic.v006.zip 1442056427916 7416740 3343255 2 DE 34399 32197 ES 29606 26911 (DE)Wiktionary-based DE-ES dictionary. +DE-ES.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-ES.quickdic.v007.zip 1449523882659 4000266 3273614 2 DE 34399 32197 ES 29606 26911 (DE)Wiktionary-based DE-ES dictionary. DE-FR.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-FR.quickdic.v006.zip 1442056552895 20761086 15018822 2 DE 48386 45902 FR 32854 29253 (DE)Wiktionary-based DE-FR dictionary. DE-HE.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-HE.quickdic.v006.zip 1357252532185 1672312 577438 2 DE 8503 7935 HE 12590 5651 Wikitionary-based DE-HE dictionary. DE-HU.quickdic http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/DE-HU.quickdic.v006.zip 1442056707554 3771060 1667008 2 DE 19685 18595 HU 17566 15923 (DE)Wiktionary-based DE-HU dictionary. diff --git a/src/com/hughes/android/dictionary/engine/Dictionary.java b/src/com/hughes/android/dictionary/engine/Dictionary.java index 2cc05b4..57ae6e7 100644 --- a/src/com/hughes/android/dictionary/engine/Dictionary.java +++ b/src/com/hughes/android/dictionary/engine/Dictionary.java @@ -16,6 +16,7 @@ package com.hughes.android.dictionary.engine; import com.hughes.android.dictionary.DictionaryInfo; import com.hughes.util.CachingList; +import com.hughes.util.StringUtil; import com.hughes.util.raf.RAFList; import com.hughes.util.raf.RAFListSerializer; import com.hughes.util.raf.RAFSerializable; @@ -34,7 +35,7 @@ public class Dictionary implements RAFSerializable { static final int CACHE_SIZE = 5000; - static final int CURRENT_DICT_VERSION = 6; + static final int CURRENT_DICT_VERSION = 7; static final String END_OF_DICTIONARY = "END OF DICTIONARY"; // persisted @@ -75,25 +76,25 @@ public class Dictionary implements RAFSerializable { // disrupts the offset. try { final RAFList rafSources = RAFList.create(raf, new EntrySource.Serializer( - this), raf.getFilePointer()); + this), raf.getFilePointer(), dictFileVersion); sources = new ArrayList(rafSources); raf.seek(rafSources.getEndOffset()); pairEntries = CachingList.create( - RAFList.create(raf, new PairEntry.Serializer(this), raf.getFilePointer()), + RAFList.create(raf, new PairEntry.Serializer(this), raf.getFilePointer(), dictFileVersion, dictFileVersion >= 7 ? 64 : 1, dictFileVersion >= 7), CACHE_SIZE); textEntries = CachingList.create( - RAFList.create(raf, new TextEntry.Serializer(this), raf.getFilePointer()), + RAFList.create(raf, new TextEntry.Serializer(this), raf.getFilePointer(), dictFileVersion), CACHE_SIZE); if (dictFileVersion >= 5) { htmlEntries = CachingList.create( - RAFList.create(raf, new HtmlEntry.Serializer(this), raf.getFilePointer()), + RAFList.create(raf, new HtmlEntry.Serializer(this), raf.getFilePointer(), dictFileVersion), CACHE_SIZE); } else { htmlEntries = Collections.emptyList(); } indices = CachingList.createFullyCached(RAFList.create(raf, indexSerializer, - raf.getFilePointer())); + raf.getFilePointer(), dictFileVersion)); } catch (RuntimeException e) { final IOException ioe = new IOException("RuntimeException loading dictionary"); ioe.initCause(e); @@ -111,11 +112,17 @@ public class Dictionary implements RAFSerializable { raf.writeInt(dictFileVersion); raf.writeLong(creationMillis); raf.writeUTF(dictInfo); + System.out.println("sources start: " + raf.getFilePointer()); RAFList.write(raf, sources, new EntrySource.Serializer(this)); - RAFList.write(raf, pairEntries, new PairEntry.Serializer(this)); + System.out.println("pair start: " + raf.getFilePointer()); + RAFList.write(raf, pairEntries, new PairEntry.Serializer(this), 64, true); + System.out.println("text start: " + raf.getFilePointer()); RAFList.write(raf, textEntries, new TextEntry.Serializer(this)); + System.out.println("html start: " + raf.getFilePointer()); RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this)); + System.out.println("indices start: " + raf.getFilePointer()); RAFList.write(raf, indices, indexSerializer); + System.out.println("end: " + raf.getFilePointer()); raf.writeUTF(END_OF_DICTIONARY); } diff --git a/src/com/hughes/android/dictionary/engine/HtmlEntry.java b/src/com/hughes/android/dictionary/engine/HtmlEntry.java index e37d19c..7694fe9 100644 --- a/src/com/hughes/android/dictionary/engine/HtmlEntry.java +++ b/src/com/hughes/android/dictionary/engine/HtmlEntry.java @@ -33,7 +33,7 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable htmlRef = new SoftReference(null); - private LazyHtmlLoader(final DataInput inp) throws IOException { + private LazyHtmlLoader(final DataInput inp, int version) throws IOException { raf = (RandomAccessFile)inp; - numBytes = raf.readInt(); - numZipBytes = raf.readInt(); + if (version >= 7) { + numBytes = -1; + numZipBytes = StringUtil.readVarInt(raf); + } else { + numBytes = raf.readInt(); + numZipBytes = raf.readInt(); + } offset = raf.getFilePointer(); raf.skipBytes(numZipBytes); } @@ -203,7 +207,6 @@ public class HtmlEntry extends AbstractEntry implements RAFSerializable { mainTokenCount = raf.readInt(); } sortedIndexEntries = CachingList.create( - RAFList.create(raf, indexEntrySerializer, raf.getFilePointer()), CACHE_SIZE); + RAFList.create(raf, indexEntrySerializer, raf.getFilePointer(), dict.dictFileVersion, + dict.dictFileVersion >= 7 ? 16 : 1, dict.dictFileVersion >= 7), CACHE_SIZE); if (dict.dictFileVersion >= 4) { stoplist = new SerializableSerializer>().read(raf); } else { @@ -155,7 +158,7 @@ public final class Index implements RAFSerializable { if (dict.dictFileVersion >= 2) { raf.writeInt(mainTokenCount); } - RAFList.write(raf, sortedIndexEntries, indexEntrySerializer); + RAFList.write(raf, sortedIndexEntries, indexEntrySerializer, 16, true); new SerializableSerializer>().write(raf, stoplist); UniformRAFList.write(raf, rows, new RowBase.Serializer(this), 5 /* * bytes @@ -188,7 +191,8 @@ public final class Index implements RAFSerializable { private final String normalizedToken; public final int startRow; public final int numRows; // doesn't count the token row! - public final List htmlEntries; + public List htmlEntries; + private int[] htmlEntryIndices; public IndexEntry(final Index index, final String token, final String normalizedToken, final int startRow, final int numRows) { @@ -202,34 +206,56 @@ public final class Index implements RAFSerializable { this.htmlEntries = new ArrayList(); } - public IndexEntry(final Index index, final DataInput inp) throws IOException { + public IndexEntry(final Index index, final DataInput raf) throws IOException { this.index = index; - RandomAccessFile raf = (RandomAccessFile)inp; token = raf.readUTF(); - startRow = raf.readInt(); - numRows = raf.readInt(); + if (index.dict.dictFileVersion >= 7) { + startRow = StringUtil.readVarInt(raf); + numRows = StringUtil.readVarInt(raf); + } else { + startRow = raf.readInt(); + numRows = raf.readInt(); + } final boolean hasNormalizedForm = raf.readBoolean(); normalizedToken = hasNormalizedForm ? raf.readUTF() : token; - if (index.dict.dictFileVersion >= 6) { + htmlEntryIndices = null; + if (index.dict.dictFileVersion >= 7) { + int size = StringUtil.readVarInt(raf); + htmlEntryIndices = new int[size]; + for (int i = 0; i < size; ++i) { + htmlEntryIndices[i] = StringUtil.readVarInt(raf); + } + this.htmlEntries = CachingList.create(new AbstractList() { + @Override + public HtmlEntry get(int i) { + return index.dict.htmlEntries.get(htmlEntryIndices[i]); + } + @Override + public int size() { + return htmlEntryIndices.length; + } + }, 1); + } else if (index.dict.dictFileVersion >= 6) { this.htmlEntries = CachingList.create( - RAFList.create(raf, index.dict.htmlEntryIndexSerializer, - raf.getFilePointer()), 1); + RAFList.create((RandomAccessFile)raf, index.dict.htmlEntryIndexSerializer, + ((RandomAccessFile)raf).getFilePointer(), index.dict.dictFileVersion), 1); } else { this.htmlEntries = Collections.emptyList(); } } - public void write(DataOutput out) throws IOException { - RandomAccessFile raf = (RandomAccessFile)out; + public void write(DataOutput raf) throws IOException { raf.writeUTF(token); - raf.writeInt(startRow); - raf.writeInt(numRows); + StringUtil.writeVarInt(raf, startRow); + StringUtil.writeVarInt(raf, numRows); final boolean hasNormalizedForm = !token.equals(normalizedToken); raf.writeBoolean(hasNormalizedForm); if (hasNormalizedForm) { raf.writeUTF(normalizedToken); } - RAFList.write(raf, htmlEntries, index.dict.htmlEntryIndexSerializer); + StringUtil.writeVarInt(raf, htmlEntries.size()); + for (HtmlEntry e : htmlEntries) + StringUtil.writeVarInt(raf, e.index()); } public String toString() { diff --git a/src/com/hughes/android/dictionary/engine/PairEntry.java b/src/com/hughes/android/dictionary/engine/PairEntry.java index 00f690a..ad9625c 100644 --- a/src/com/hughes/android/dictionary/engine/PairEntry.java +++ b/src/com/hughes/android/dictionary/engine/PairEntry.java @@ -14,6 +14,7 @@ package com.hughes.android.dictionary.engine; +import com.hughes.util.StringUtil; import com.hughes.util.raf.RAFListSerializer; import com.hughes.util.raf.RAFSerializable; import com.ibm.icu.text.Transliterator; @@ -44,7 +45,7 @@ public class PairEntry extends AbstractEntry implements RAFSerializable(size); for (int i = 0; i < size; ++i) { pairs.add(new Pair(raf.readUTF(), raf.readUTF())); @@ -54,8 +55,7 @@ public class PairEntry extends AbstractEntry implements RAFSerializable 0; raf.writeUTF(pairs.get(i).lang1);