X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FDictionary.java;h=35c3143606f544e71cc2fd569a9d4e7cc0378d97;hb=63044069c666fd697175065706d2ee5858a4e153;hp=fef86bd6577f023a535fe6d528cdcd83a660f341;hpb=e79165503392ed6a7cb7a8eadc15eaae0cda9443;p=Dictionary.git diff --git a/src/com/hughes/android/dictionary/engine/Dictionary.java b/src/com/hughes/android/dictionary/engine/Dictionary.java index fef86bd..35c3143 100644 --- a/src/com/hughes/android/dictionary/engine/Dictionary.java +++ b/src/com/hughes/android/dictionary/engine/Dictionary.java @@ -14,17 +14,14 @@ package com.hughes.android.dictionary.engine; -import com.hughes.android.dictionary.DictionaryInfo; -import com.hughes.util.CachingList; -import com.hughes.util.raf.RAFList; -import com.hughes.util.raf.RAFListSerializer; -import com.hughes.util.raf.RAFSerializable; - +import java.io.BufferedOutputStream; import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; +import java.io.DataOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectOutputStream; import java.io.PrintStream; @@ -37,6 +34,12 @@ import java.util.Collections; import java.util.List; import java.util.zip.GZIPOutputStream; +import com.hughes.android.dictionary.DictionaryInfo; +import com.hughes.util.CachingList; +import com.hughes.util.raf.RAFList; +import com.hughes.util.raf.RAFListSerializer; +import com.hughes.util.raf.RAFSerializable; + public class Dictionary implements RAFSerializable { private static final int CACHE_SIZE = 5000; @@ -143,152 +146,218 @@ public class Dictionary implements RAFSerializable { } private void writev6Sources(RandomAccessFile out) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + out.writeInt(sources.size()); long tocPos = out.getFilePointer(); out.seek(tocPos + sources.size() * 8 + 8); for (EntrySource s : sources) { long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); + tocout.writeLong(dataPos); + out.writeUTF(s.getName()); out.writeInt(s.getNumEntries()); } long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + tocout.close(); + out.seek(tocPos); - out.writeLong(dataPos); + out.write(toc.toByteArray()); out.seek(dataPos); } private void writev6PairEntries(RandomAccessFile out) throws IOException { - out.writeInt(pairEntries.size()); + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + long tocPos = out.getFilePointer(); - out.seek(tocPos + pairEntries.size() * 8 + 8); + long dataPos = tocPos + 4 + pairEntries.size() * 8 + 8; + + out.seek(dataPos); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + + tocout.writeInt(pairEntries.size()); for (PairEntry pe : pairEntries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeShort(pe.entrySource.index()); - out.writeInt(pe.pairs.size()); + tocout.writeLong(dataPos + outb.size()); + + outb.writeShort(pe.entrySource.index()); + outb.writeInt(pe.pairs.size()); for (PairEntry.Pair p : pe.pairs) { - out.writeUTF(p.lang1); - out.writeUTF(p.lang2); + outb.writeUTF(p.lang1); + outb.writeUTF(p.lang2); } } - long dataPos = out.getFilePointer(); + dataPos += outb.size(); + outb.flush(); + tocout.writeLong(dataPos); + tocout.close(); + out.seek(tocPos); - out.writeLong(dataPos); + out.write(toc.toByteArray()); out.seek(dataPos); } private void writev6TextEntries(RandomAccessFile out) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + out.writeInt(textEntries.size()); long tocPos = out.getFilePointer(); out.seek(tocPos + textEntries.size() * 8 + 8); for (TextEntry t : textEntries) { long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); + tocout.writeLong(dataPos); + out.writeShort(t.entrySource.index()); out.writeUTF(t.text); } long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + tocout.close(); + out.seek(tocPos); - out.writeLong(dataPos); + out.write(toc.toByteArray()); out.seek(dataPos); } + private void writev6EmptyList(RandomAccessFile out) throws IOException { + out.writeInt(0); + out.writeLong(out.getFilePointer() + 8); + } + private void writev6HtmlEntries(RandomAccessFile out) throws IOException { - out.writeInt(htmlEntries.size()); + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + long tocPos = out.getFilePointer(); - out.seek(tocPos + htmlEntries.size() * 8 + 8); + long dataPos = tocPos + 4 + htmlEntries.size() * 8 + 8; + + out.seek(dataPos); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + + tocout.writeInt(htmlEntries.size()); for (HtmlEntry h : htmlEntries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeShort(h.entrySource.index()); - out.writeUTF(h.title); + tocout.writeLong(dataPos + outb.size()); + + outb.writeShort(h.entrySource.index()); + outb.writeUTF(h.title); byte[] data = h.getHtml().getBytes(StandardCharsets.UTF_8); - out.writeInt(data.length); + outb.writeInt(data.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPOutputStream gzout = new GZIPOutputStream(baos); gzout.write(data); gzout.close(); - out.writeInt(baos.size()); - out.write(baos.toByteArray()); + outb.writeInt(baos.size()); + outb.write(baos.toByteArray()); } - long dataPos = out.getFilePointer(); + dataPos += outb.size(); + outb.flush(); + tocout.writeLong(dataPos); + tocout.close(); + out.seek(tocPos); - out.writeLong(dataPos); + out.write(toc.toByteArray()); out.seek(dataPos); } - private void writev6HtmlIndices(RandomAccessFile out, List entries) throws IOException { + private void writev6HtmlIndices(DataOutputStream out, long pos, List entries) throws IOException { + long dataPos = pos + 4 + entries.size() * 8 + 8; + out.writeInt(entries.size()); - long tocPos = out.getFilePointer(); - out.seek(tocPos + entries.size() * 8 + 8); - for (HtmlEntry e : entries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); + + // TOC is trivial, so optimize writing it + for (int i = 0; i < entries.size(); i++) { out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeInt(e.index()); + dataPos += 4; } - long dataPos = out.getFilePointer(); - out.seek(tocPos); out.writeLong(dataPos); - out.seek(dataPos); + + for (HtmlEntry e : entries) { + out.writeInt(e.index()); + } } - private void writev6IndexEntries(RandomAccessFile out, List entries) throws IOException { - out.writeInt(entries.size()); + private void writev6IndexEntries(RandomAccessFile out, List entries, int[] prunedRowIdx) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + long tocPos = out.getFilePointer(); - out.seek(tocPos + entries.size() * 8 + 8); + long dataPos = tocPos + 4 + entries.size() * 8 + 8; + + out.seek(dataPos); + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + + tocout.writeInt(entries.size()); for (Index.IndexEntry e : entries) { - long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); - out.writeUTF(e.token); - out.writeInt(e.startRow); - out.writeInt(e.numRows); + tocout.writeLong(dataPos + outb.size()); + + outb.writeUTF(e.token); + + int startRow = e.startRow; + int numRows = e.numRows; + if (prunedRowIdx != null) { + // note: the start row will always be a TokenRow + // and thus never be pruned + int newNumRows = 1; + for (int i = 1; i < numRows; i++) { + if (prunedRowIdx[startRow + i] >= 0) newNumRows++; + } + startRow = prunedRowIdx[startRow]; + numRows = newNumRows; + } + + outb.writeInt(startRow); + outb.writeInt(numRows); final boolean hasNormalizedForm = !e.token.equals(e.normalizedToken()); - out.writeBoolean(hasNormalizedForm); - if (hasNormalizedForm) out.writeUTF(e.normalizedToken()); - writev6HtmlIndices(out, e.htmlEntries); + outb.writeBoolean(hasNormalizedForm); + if (hasNormalizedForm) outb.writeUTF(e.normalizedToken()); + writev6HtmlIndices(outb, dataPos + outb.size(), + prunedRowIdx == null ? e.htmlEntries : Collections.emptyList()); } - long dataPos = out.getFilePointer(); + dataPos += outb.size(); + outb.flush(); + tocout.writeLong(dataPos); + tocout.close(); + out.seek(tocPos); - out.writeLong(dataPos); + out.write(toc.toByteArray()); out.seek(dataPos); } - private void writev6Index(RandomAccessFile out) throws IOException { + private void writev6Index(RandomAccessFile out, boolean skipHtml) throws IOException { + ByteArrayOutputStream toc = new ByteArrayOutputStream(); + DataOutputStream tocout = new DataOutputStream(toc); + out.writeInt(indices.size()); long tocPos = out.getFilePointer(); out.seek(tocPos + indices.size() * 8 + 8); for (Index idx : indices) { + // create pruned index for skipHtml feature + int[] prunedRowIdx = null; + int prunedSize = 0; + if (skipHtml) { + prunedRowIdx = new int[idx.rows.size()]; + for (int i = 0; i < idx.rows.size(); i++) { + final RowBase r = idx.rows.get(i); + // prune Html entries + boolean pruned = r instanceof HtmlEntry.Row; + prunedRowIdx[i] = pruned ? -1 : prunedSize; + if (!pruned) prunedSize++; + } + } + long dataPos = out.getFilePointer(); - out.seek(tocPos); - out.writeLong(dataPos); - tocPos += 8; - out.seek(dataPos); + tocout.writeLong(dataPos); + out.writeUTF(idx.shortName); out.writeUTF(idx.longName); out.writeUTF(idx.sortLanguage.getIsoCode()); out.writeUTF(idx.normalizerRules); out.writeBoolean(idx.swapPairEntries); out.writeInt(idx.mainTokenCount); - writev6IndexEntries(out, idx.sortedIndexEntries); + writev6IndexEntries(out, idx.sortedIndexEntries, prunedRowIdx); // write stoplist, serializing the whole Set *shudder* final ByteArrayOutputStream baos = new ByteArrayOutputStream(); @@ -296,11 +365,14 @@ public class Dictionary implements RAFSerializable { oos.writeObject(idx.stoplist); oos.close(); final byte[] bytes = baos.toByteArray(); - out.writeInt(bytes.length); - out.write(bytes); - out.writeInt(idx.rows.size()); - out.writeInt(5); + + DataOutputStream outb = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(out.getFD()))); + outb.writeInt(bytes.length); + outb.write(bytes); + + outb.writeInt(skipHtml ? prunedSize : idx.rows.size()); + outb.writeInt(5); for (RowBase r : idx.rows) { int type = 0; if (r instanceof PairEntry.Row) { @@ -312,20 +384,25 @@ public class Dictionary implements RAFSerializable { type = 2; } else if (r instanceof HtmlEntry.Row) { type = 4; + if (skipHtml) continue; } else { throw new RuntimeException("Row type not supported for v6"); } - out.writeByte(type); - out.writeInt(r.referenceIndex); + outb.writeByte(type); + outb.writeInt(r.referenceIndex); } + outb.flush(); } long dataPos = out.getFilePointer(); + tocout.writeLong(dataPos); + tocout.close(); + out.seek(tocPos); - out.writeLong(dataPos); + out.write(toc.toByteArray()); out.seek(dataPos); } - public void writev6(DataOutput out) throws IOException { + public void writev6(DataOutput out, boolean skipHtml) throws IOException { RandomAccessFile raf = (RandomAccessFile)out; raf.writeInt(6); raf.writeLong(creationMillis); @@ -337,9 +414,10 @@ public class Dictionary implements RAFSerializable { System.out.println("text start: " + raf.getFilePointer()); writev6TextEntries(raf); System.out.println("html index start: " + raf.getFilePointer()); - writev6HtmlEntries(raf); + if (skipHtml) writev6EmptyList(raf); + else writev6HtmlEntries(raf); System.out.println("indices start: " + raf.getFilePointer()); - writev6Index(raf); + writev6Index(raf, skipHtml); System.out.println("end: " + raf.getFilePointer()); raf.writeUTF(END_OF_DICTIONARY); }