X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FDictionary.java;h=6539bc9b0f0a52afce3eae135fa681bf527ddc53;hb=2a18ab8b97ba0254a0655d595f05c492eb0eecd4;hp=35c3143606f544e71cc2fd569a9d4e7cc0378d97;hpb=63044069c666fd697175065706d2ee5858a4e153;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/engine/Dictionary.java b/src/com/hughes/android/dictionary/engine/Dictionary.java index 35c3143..6539bc9 100644 --- a/src/com/hughes/android/dictionary/engine/Dictionary.java +++ b/src/com/hughes/android/dictionary/engine/Dictionary.java @@ -14,28 +14,21 @@ package com.hughes.android.dictionary.engine; -import java.io.BufferedOutputStream; -import java.io.ByteArrayOutputStream; import java.io.DataInput; -import java.io.DataInputStream; import java.io.DataOutput; -import java.io.DataOutputStream; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.ObjectOutputStream; import java.io.PrintStream; import java.io.RandomAccessFile; -import java.nio.channels.Channels; +import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.zip.GZIPOutputStream; import com.hughes.android.dictionary.DictionaryInfo; import com.hughes.util.CachingList; +import com.hughes.util.DataInputBuffer; import com.hughes.util.raf.RAFList; import com.hughes.util.raf.RAFListSerializer; import com.hughes.util.raf.RAFSerializable; @@ -49,12 +42,12 @@ public class Dictionary implements RAFSerializable { // persisted final int dictFileVersion; - private final long creationMillis; + public final long creationMillis; public final String dictInfo; public final List pairEntries; public final List textEntries; public final List htmlEntries; - public final List htmlData; + public final List htmlData; public final List sources; public final List indices; @@ -76,46 +69,47 @@ public class Dictionary implements RAFSerializable { } public Dictionary(final FileChannel ch) throws IOException { - DataInput raf = new DataInputStream(Channels.newInputStream(ch)); - dictFileVersion = raf.readInt(); + MappedByteBuffer wholefile = ch.map(FileChannel.MapMode.READ_ONLY, 0, ch.size()); + DataInputBuffer in = new DataInputBuffer(wholefile, 0); + dictFileVersion = in.readInt(); if (dictFileVersion < 0 || dictFileVersion > CURRENT_DICT_VERSION) { throw new IOException("Invalid dictionary version: " + dictFileVersion); } - creationMillis = raf.readLong(); - dictInfo = raf.readUTF(); + creationMillis = in.readLong(); + dictInfo = in.readUTF(); // Load the sources, then seek past them, because reading them later // disrupts the offset. try { - final RAFList rafSources = RAFList.create(ch, new EntrySource.Serializer( - this), ch.position(), dictFileVersion, dictInfo + " sources: "); + final RAFList rafSources = RAFList.create(in, new EntrySource.Serializer( + this), dictFileVersion, dictInfo + " sources: "); sources = new ArrayList<>(rafSources); ch.position(rafSources.getEndOffset()); pairEntries = CachingList.create( - RAFList.create(ch, new PairEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " pairs: "), + RAFList.create(in, new PairEntry.Serializer(this), dictFileVersion, dictInfo + " pairs: "), CACHE_SIZE, false); textEntries = CachingList.create( - RAFList.create(ch, new TextEntry.Serializer(this), ch.position(), dictFileVersion, dictInfo + " text: "), + RAFList.create(in, new TextEntry.Serializer(this), dictFileVersion, dictInfo + " text: "), CACHE_SIZE, true); if (dictFileVersion >= 5) { htmlEntries = CachingList.create( - RAFList.create(ch, new HtmlEntry.Serializer(this, ch), ch.position(), dictFileVersion, dictInfo + " html: "), + RAFList.create(in, new HtmlEntry.Serializer(this), dictFileVersion, dictInfo + " html: "), CACHE_SIZE, false); } else { htmlEntries = Collections.emptyList(); } if (dictFileVersion >= 7) { - htmlData = RAFList.create(ch, new HtmlEntry.DataDeserializer(), ch.position(), dictFileVersion, dictInfo + " html: "); + htmlData = RAFList.create(in, new HtmlEntry.DataDeserializer(), dictFileVersion, dictInfo + " html: "); } else { htmlData = null; } - indices = CachingList.createFullyCached(RAFList.create(ch, new IndexSerializer(ch), - ch.position(), dictFileVersion, dictInfo + " index: ")); + indices = CachingList.createFullyCached(RAFList.create(in, new IndexSerializer(), + dictFileVersion, dictInfo + " index: ")); } catch (RuntimeException e) { throw new IOException("RuntimeException loading dictionary", e); } - final String end = raf.readUTF(); + final String end = in.readUTF(); if (!end.equals(END_OF_DICTIONARY)) { throw new IOException("Dictionary seems corrupt: " + end); } @@ -135,303 +129,20 @@ public class Dictionary implements RAFSerializable { System.out.println("text start: " + raf.getFilePointer()); RAFList.write(raf, textEntries, new TextEntry.Serializer(this)); System.out.println("html index start: " + raf.getFilePointer()); - RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this, null), 64, true); + RAFList.write(raf, htmlEntries, new HtmlEntry.Serializer(this), 64, true); System.out.println("html data start: " + raf.getFilePointer()); assert htmlData == null; RAFList.write(raf, htmlEntries, new HtmlEntry.DataSerializer(), 128, true); System.out.println("indices start: " + raf.getFilePointer()); - RAFList.write(raf, indices, new IndexSerializer(null)); - System.out.println("end: " + raf.getFilePointer()); - raf.writeUTF(END_OF_DICTIONARY); - } - - private void writev6Sources(RandomAccessFile out) throws IOException { - ByteArrayOutputStream toc = new ByteArrayOutputStream(); - DataOutputStream tocout = new DataOutputStream(toc); - - out.writeInt(sources.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + sources.size() * 8 + 8); - for (EntrySource s : sources) { - long dataPos = out.getFilePointer(); - tocout.writeLong(dataPos); - - out.writeUTF(s.getName()); - out.writeInt(s.getNumEntries()); - } - long dataPos = out.getFilePointer(); - tocout.writeLong(dataPos); - tocout.close(); - - out.seek(tocPos); - out.write(toc.toByteArray()); - out.seek(dataPos); - } - - private void writev6PairEntries(RandomAccessFile out) throws IOException { - ByteArrayOutputStream toc = new ByteArrayOutputStream(); - DataOutputStream tocout = new DataOutputStream(toc); - - long tocPos = out.getFilePointer(); - long dataPos = tocPos + 4 + pairEntries.size() * 8 + 8; - - out.seek(dataPos); - DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); - - tocout.writeInt(pairEntries.size()); - for (PairEntry pe : pairEntries) { - tocout.writeLong(dataPos + outb.size()); - - outb.writeShort(pe.entrySource.index()); - outb.writeInt(pe.pairs.size()); - for (PairEntry.Pair p : pe.pairs) { - outb.writeUTF(p.lang1); - outb.writeUTF(p.lang2); - } - } - dataPos += outb.size(); - outb.flush(); - tocout.writeLong(dataPos); - tocout.close(); - - out.seek(tocPos); - out.write(toc.toByteArray()); - out.seek(dataPos); - } - - private void writev6TextEntries(RandomAccessFile out) throws IOException { - ByteArrayOutputStream toc = new ByteArrayOutputStream(); - DataOutputStream tocout = new DataOutputStream(toc); - - out.writeInt(textEntries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + textEntries.size() * 8 + 8); - for (TextEntry t : textEntries) { - long dataPos = out.getFilePointer(); - tocout.writeLong(dataPos); - - out.writeShort(t.entrySource.index()); - out.writeUTF(t.text); - } - long dataPos = out.getFilePointer(); - tocout.writeLong(dataPos); - tocout.close(); - - out.seek(tocPos); - out.write(toc.toByteArray()); - out.seek(dataPos); - } - - private void writev6EmptyList(RandomAccessFile out) throws IOException { - out.writeInt(0); - out.writeLong(out.getFilePointer() + 8); - } - - private void writev6HtmlEntries(RandomAccessFile out) throws IOException { - ByteArrayOutputStream toc = new ByteArrayOutputStream(); - DataOutputStream tocout = new DataOutputStream(toc); - - long tocPos = out.getFilePointer(); - long dataPos = tocPos + 4 + htmlEntries.size() * 8 + 8; - - out.seek(dataPos); - DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); - - tocout.writeInt(htmlEntries.size()); - for (HtmlEntry h : htmlEntries) { - tocout.writeLong(dataPos + outb.size()); - - outb.writeShort(h.entrySource.index()); - outb.writeUTF(h.title); - byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8); - outb.writeInt(data.length); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - GZIPOutputStream gzout = new GZIPOutputStream(baos); - gzout.write(data); - gzout.close(); - outb.writeInt(baos.size()); - outb.write(baos.toByteArray()); - } - dataPos += outb.size(); - outb.flush(); - tocout.writeLong(dataPos); - tocout.close(); - - out.seek(tocPos); - out.write(toc.toByteArray()); - out.seek(dataPos); - } - - private void writev6HtmlIndices(DataOutputStream out, long pos, List entries) throws IOException { - long dataPos = pos + 4 + entries.size() * 8 + 8; - - out.writeInt(entries.size()); - - // TOC is trivial, so optimize writing it - for (int i = 0; i < entries.size(); i++) { - out.writeLong(dataPos); - dataPos += 4; - } - out.writeLong(dataPos); - - for (HtmlEntry e : entries) { - out.writeInt(e.index()); - } - } - - private void writev6IndexEntries(RandomAccessFile out, List entries, int[] prunedRowIdx) throws IOException { - ByteArrayOutputStream toc = new ByteArrayOutputStream(); - DataOutputStream tocout = new DataOutputStream(toc); - - long tocPos = out.getFilePointer(); - long dataPos = tocPos + 4 + entries.size() * 8 + 8; - - out.seek(dataPos); - DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); - - tocout.writeInt(entries.size()); - for (Index.IndexEntry e : entries) { - tocout.writeLong(dataPos + outb.size()); - - outb.writeUTF(e.token); - - int startRow = e.startRow; - int numRows = e.numRows; - if (prunedRowIdx != null) { - // note: the start row will always be a TokenRow - // and thus never be pruned - int newNumRows = 1; - for (int i = 1; i < numRows; i++) { - if (prunedRowIdx[startRow + i] >= 0) newNumRows++; - } - startRow = prunedRowIdx[startRow]; - numRows = newNumRows; - } - - outb.writeInt(startRow); - outb.writeInt(numRows); - final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken()); - outb.writeBoolean(hasNormalizedForm); - if (hasNormalizedForm) outb.writeUTF(e.normalizedToken()); - writev6HtmlIndices(outb, dataPos + outb.size(), - prunedRowIdx == null ? e.htmlEntries : Collections.emptyList()); - } - dataPos += outb.size(); - outb.flush(); - tocout.writeLong(dataPos); - tocout.close(); - - out.seek(tocPos); - out.write(toc.toByteArray()); - out.seek(dataPos); - } - - private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException { - ByteArrayOutputStream toc = new ByteArrayOutputStream(); - DataOutputStream tocout = new DataOutputStream(toc); - - out.writeInt(indices.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + indices.size() * 8 + 8); - for (Index idx : indices) { - // create pruned index for skipHtml feature - int[] prunedRowIdx = null; - int prunedSize = 0; - if (skipHtml) { - prunedRowIdx = new int[idx.rows.size()]; - for (int i = 0; i < idx.rows.size(); i++) { - final RowBase r = idx.rows.get(i); - // prune Html entries - boolean pruned = r instanceof HtmlEntry.Row; - prunedRowIdx[i] = pruned ? -1 : prunedSize; - if (!pruned) prunedSize++; - } - } - - long dataPos = out.getFilePointer(); - tocout.writeLong(dataPos); - - out.writeUTF(idx.shortName); - out.writeUTF(idx.longName); - out.writeUTF(idx.sortLanguage.getIsoCode()); - out.writeUTF(idx.normalizerRules); - out.writeBoolean(idx.swapPairEntries); - out.writeInt(idx.mainTokenCount); - writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx); - - // write stoplist, serializing the whole Set *shudder* - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final ObjectOutputStream oos = new ObjectOutputStream(baos); - oos.writeObject(idx.stoplist); - oos.close(); - final byte[] bytes = baos.toByteArray(); - - - DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); - outb.writeInt(bytes.length); - outb.write(bytes); - - outb.writeInt(skipHtml ? prunedSize : idx.rows.size()); - outb.writeInt(5); - for (RowBase r : idx.rows) { - int type = 0; - if (r instanceof PairEntry.Row) { - type = 0; - } else if (r instanceof TokenRow) { - final TokenRow tokenRow = (TokenRow)r; - type = tokenRow.hasMainEntry ? 1 : 3; - } else if (r instanceof TextEntry.Row) { - type = 2; - } else if (r instanceof HtmlEntry.Row) { - type = 4; - if (skipHtml) continue; - } else { - throw new RuntimeException("Row type not supported for v6"); - } - outb.writeByte(type); - outb.writeInt(r.referenceIndex); - } - outb.flush(); - } - long dataPos = out.getFilePointer(); - tocout.writeLong(dataPos); - tocout.close(); - - out.seek(tocPos); - out.write(toc.toByteArray()); - out.seek(dataPos); - } - - public void writev6(DataOutput out, boolean skipHtml) throws IOException { - RandomAccessFile raf = (RandomAccessFile)out; - raf.writeInt(6); - raf.writeLong(creationMillis); - raf.writeUTF(dictInfo); - System.out.println("sources start: " + raf.getFilePointer()); - writev6Sources(raf); - System.out.println("pair start: " + raf.getFilePointer()); - writev6PairEntries(raf); - System.out.println("text start: " + raf.getFilePointer()); - writev6TextEntries(raf); - System.out.println("html index start: " + raf.getFilePointer()); - if (skipHtml) writev6EmptyList(raf); - else writev6HtmlEntries(raf); - System.out.println("indices start: " + raf.getFilePointer()); - writev6Index(raf, skipHtml); + RAFList.write(raf, indices, new IndexSerializer()); System.out.println("end: " + raf.getFilePointer()); raf.writeUTF(END_OF_DICTIONARY); } private final class IndexSerializer implements RAFListSerializer { - private final FileChannel ch; - - IndexSerializer(FileChannel ch) { - this.ch = ch; - } - @Override public Index read(DataInput raf, final int readIndex) throws IOException { - return new Index(Dictionary.this, ch, raf); + return new Index(Dictionary.this, (DataInputBuffer)raf); } @Override