X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FDictionary.java;h=cb7371442e3bd380cfafe31d32690d02e70546ca;hb=5f7b259669237dad4cbfdec8536537815846979b;hp=e74876da9f42daf85166c2bf917240927eb7271f;hpb=ca8920678fca725851fb18fdb1bd01752391e38a;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/engine/Dictionary.java b/src/com/hughes/android/dictionary/engine/Dictionary.java index e74876d..cb73714 100644 --- a/src/com/hughes/android/dictionary/engine/Dictionary.java +++ b/src/com/hughes/android/dictionary/engine/Dictionary.java @@ -14,30 +14,25 @@ package com.hughes.android.dictionary.engine; -import java.io.ByteArrayOutputStream; import java.io.DataInput; -import java.io.DataInputStream; import java.io.DataOutput; import java.io.File; import java.io.IOException; -import java.io.ObjectOutputStream; import java.io.PrintStream; import java.io.RandomAccessFile; -import java.nio.channels.Channels; +import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.zip.GZIPOutputStream; import com.hughes.android.dictionary.DictionaryInfo; import com.hughes.util.CachingList; +import com.hughes.util.DataInputBuffer; import com.hughes.util.raf.RAFList; import com.hughes.util.raf.RAFListSerializer; -import com.hughes.util.raf.RAFSerializable; -public class Dictionary implements RAFSerializable { +public class Dictionary { private static final int CACHE_SIZE = 5000; @@ -46,14 +41,17 @@ public class Dictionary implements RAFSerializable { // persisted final int dictFileVersion; - private final long creationMillis; + public final long creationMillis; public final String dictInfo; public final List pairEntries; public final List textEntries; public final List htmlEntries; - public final List htmlData; + public final List htmlData; public final List sources; public final List indices; + // Could be a local variable in constructor, but + // this way avoids a native-image VM bug. + private final MappedByteBuffer wholefile; /** * dictFileVersion 1 adds:
  • links to sources? dictFileVersion 2 adds:
  • @@ -70,55 +68,56 @@ public class Dictionary implements RAFSerializable { htmlData = null; sources = new ArrayList<>(); indices = new ArrayList<>(); + wholefile = null; } public Dictionary(final FileChannel ch) throws IOException { - DataInput raf = new DataInputStream(Channels.newInputStream(ch)); - dictFileVersion = raf.readInt(); + wholefile = ch.map(FileChannel.MapMode.READ_ONLY, 0, ch.size()); + DataInputBuffer in = new DataInputBuffer(wholefile, 0); + dictFileVersion = in.readInt(); if (dictFileVersion < 0 || dictFileVersion > CURRENT_DICT_VERSION) { throw new IOException("Invalid dictionary version: " + dictFileVersion); } - creationMillis = raf.readLong(); - dictInfo = raf.readUTF(); + creationMillis = in.readLong(); + dictInfo = in.readUTF(); // Load the sources, then seek past them, because reading them later // disrupts the offset. try { - final RAFList rafSources = RAFList.create(ch, new EntrySource.Serializer( - this), ch.position(), dictFileVersion, dictInfo + " sources: "); + final RAFList rafSources = RAFList.create(in, new EntrySource.Serializer( + this), dictFileVersion, dictInfo + " sources: "); sources = new ArrayList<>(rafSources); ch.position(rafSources.getEndOffset()); pairEntries = CachingList.create( - RAFList.create(ch, new PairEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " pairs: "), + RAFList.create(in, new PairEntry.Serializer(this), dictFileVersion, dictInfo + " pairs: "), CACHE_SIZE, false); textEntries = CachingList.create( - RAFList.create(ch, new TextEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " text: "), + RAFList.create(in, new TextEntry.Serializer(this), dictFileVersion, dictInfo + " text: "), CACHE_SIZE, true); if (dictFileVersion >= 5) { htmlEntries = CachingList.create( - RAFList.create(ch, new HtmlEntry.Serializer(this, ch), ch.position(), dictFileVersion, dictInfo + " html: "), + RAFList.create(in, new HtmlEntry.Serializer(this), dictFileVersion, dictInfo + " html: "), CACHE_SIZE, false); } else { htmlEntries = Collections.emptyList(); } if (dictFileVersion >= 7) { - htmlData = RAFList.create(ch, new HtmlEntry.DataDeserializer(), ch.position(), dictFileVersion, dictInfo + " html: "); + htmlData = RAFList.create(in, new HtmlEntry.DataDeserializer(), dictFileVersion, dictInfo + " html: "); } else { htmlData = null; } - indices = CachingList.createFullyCached(RAFList.create(ch, new IndexSerializer(ch), - ch.position(), dictFileVersion, dictInfo + " index: ")); + indices = CachingList.createFullyCached(RAFList.create(in, new IndexSerializer(), + dictFileVersion, dictInfo + " index: ")); } catch (RuntimeException e) { throw new IOException("RuntimeException loading dictionary", e); } - final String end = raf.readUTF(); + final String end = in.readUTF(); if (!end.equals(END_OF_DICTIONARY)) { throw new IOException("Dictionary seems corrupt: " + end); } } - @Override public void write(DataOutput out) throws IOException { RandomAccessFile raf = (RandomAccessFile)out; if (dictFileVersion < 7) throw new RuntimeException("write function cannot write formats older than v7!"); @@ -132,263 +131,20 @@ public class Dictionary implements RAFSerializable { System.out.println("text start: " + raf.getFilePointer()); RAFList.write(raf, textEntries, new TextEntry.Serializer(this)); System.out.println("html index start: " + raf.getFilePointer()); - RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this, null), 64, true); + RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this), 64, true); System.out.println("html data start: " + raf.getFilePointer()); assert htmlData == null; RAFList.write(raf, htmlEntries, new HtmlEntry.DataSerializer(), 128, true); System.out.println("indices start: " + raf.getFilePointer()); - RAFList.write(raf, indices, new IndexSerializer(null)); - System.out.println("end: " + raf.getFilePointer()); - raf.writeUTF(END_OF_DICTIONARY); - } - - private void writev6Sources(RandomAccessFile out) throws IOException { - out.writeInt(sources.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + sources.size() * 8 + 8); - for (EntrySource s : sources) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeUTF(s.getName()); - out.writeInt(s.getNumEntries()); - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - private void writev6PairEntries(RandomAccessFile out) throws IOException { - out.writeInt(pairEntries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + pairEntries.size() * 8 + 8); - for (PairEntry pe : pairEntries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeShort(pe.entrySource.index()); - out.writeInt(pe.pairs.size()); - for (PairEntry.Pair p : pe.pairs) { - out.writeUTF(p.lang1); - out.writeUTF(p.lang2); - } - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - private void writev6TextEntries(RandomAccessFile out) throws IOException { - out.writeInt(textEntries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + textEntries.size() * 8 + 8); - for (TextEntry t : textEntries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeShort(t.entrySource.index()); - out.writeUTF(t.text); - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - private void writev6EmptyList(RandomAccessFile out) throws IOException { - out.writeInt(0); - out.writeLong(out.getFilePointer() + 8); - } - - private void writev6HtmlEntries(RandomAccessFile out) throws IOException { - out.writeInt(htmlEntries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + htmlEntries.size() * 8 + 8); - for (HtmlEntry h : htmlEntries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeShort(h.entrySource.index()); - out.writeUTF(h.title); - byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8); - out.writeInt(data.length); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - GZIPOutputStream gzout = new GZIPOutputStream(baos); - gzout.write(data); - gzout.close(); - out.writeInt(baos.size()); - out.write(baos.toByteArray()); - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - private void writev6HtmlIndices(RandomAccessFile out, List entries) throws IOException { - out.writeInt(entries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + entries.size() * 8 + 8); - for (HtmlEntry e : entries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeInt(e.index()); - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - private void writev6IndexEntries(RandomAccessFile out, List entries, int[] prunedRowIdx) throws IOException { - out.writeInt(entries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + entries.size() * 8 + 8); - for (Index.IndexEntry e : entries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeUTF(e.token); - - int startRow = e.startRow; - int numRows = e.numRows; - if (prunedRowIdx != null) { - // note: the start row will always be a TokenRow - // and thus never be pruned - int newNumRows = 1; - for (int i = 1; i < numRows; i++) { - if (prunedRowIdx[startRow + i] >= 0) newNumRows++; - } - startRow = prunedRowIdx[startRow]; - numRows = newNumRows; - } - - out.writeInt(startRow); - out.writeInt(numRows); - final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken()); - out.writeBoolean(hasNormalizedForm); - if (hasNormalizedForm) out.writeUTF(e.normalizedToken()); - writev6HtmlIndices(out, prunedRowIdx == null ? e.htmlEntries : Collections.emptyList()); - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException { - out.writeInt(indices.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + indices.size() * 8 + 8); - for (Index idx : indices) { - // create pruned index for skipHtml feature - int[] prunedRowIdx = null; - int prunedSize = 0; - if (skipHtml) { - prunedRowIdx = new int[idx.rows.size()]; - for (int i = 0; i < idx.rows.size(); i++) { - final RowBase r = idx.rows.get(i); - // prune Html entries - boolean pruned = r instanceof HtmlEntry.Row; - prunedRowIdx[i] = pruned ? -1 : prunedSize; - if (!pruned) prunedSize++; - } - } - - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeUTF(idx.shortName); - out.writeUTF(idx.longName); - out.writeUTF(idx.sortLanguage.getIsoCode()); - out.writeUTF(idx.normalizerRules); - out.writeBoolean(idx.swapPairEntries); - out.writeInt(idx.mainTokenCount); - writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx); - - // write stoplist, serializing the whole Set *shudder* - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final ObjectOutputStream oos = new ObjectOutputStream(baos); - oos.writeObject(idx.stoplist); - oos.close(); - final byte[] bytes = baos.toByteArray(); - out.writeInt(bytes.length); - out.write(bytes); - - out.writeInt(skipHtml ? prunedSize : idx.rows.size()); - out.writeInt(5); - for (RowBase r : idx.rows) { - int type = 0; - if (r instanceof PairEntry.Row) { - type = 0; - } else if (r instanceof TokenRow) { - final TokenRow tokenRow = (TokenRow)r; - type = tokenRow.hasMainEntry ? 1 : 3; - } else if (r instanceof TextEntry.Row) { - type = 2; - } else if (r instanceof HtmlEntry.Row) { - type = 4; - if (skipHtml) continue; - } else { - throw new RuntimeException("Row type not supported for v6"); - } - out.writeByte(type); - out.writeInt(r.referenceIndex); - } - } - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - out.seek(dataPos); - } - - public void writev6(DataOutput out, boolean skipHtml) throws IOException { - RandomAccessFile raf = (RandomAccessFile)out; - raf.writeInt(6); - raf.writeLong(creationMillis); - raf.writeUTF(dictInfo); - System.out.println("sources start: " + raf.getFilePointer()); - writev6Sources(raf); - System.out.println("pair start: " + raf.getFilePointer()); - writev6PairEntries(raf); - System.out.println("text start: " + raf.getFilePointer()); - writev6TextEntries(raf); - System.out.println("html index start: " + raf.getFilePointer()); - if (skipHtml) writev6EmptyList(raf); - else writev6HtmlEntries(raf); - System.out.println("indices start: " + raf.getFilePointer()); - writev6Index(raf, skipHtml); + RAFList.write(raf, indices, new IndexSerializer()); System.out.println("end: " + raf.getFilePointer()); raf.writeUTF(END_OF_DICTIONARY); } private final class IndexSerializer implements RAFListSerializer { - private final FileChannel ch; - - IndexSerializer(FileChannel ch) { - this.ch = ch; - } - @Override public Index read(DataInput raf, final int readIndex) throws IOException { - return new Index(Dictionary.this, ch, raf); + return new Index(Dictionary.this, (DataInputBuffer)raf); } @Override