From 1f446872aa9346c22643d0fb753ec42942b5a4d2 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 5 Jul 2016 16:54:19 +0200 Subject: [PATCH] LUCENE-7371: Better compression of values in Lucene60PointsFormat. --- lucene/CHANGES.txt | 3 + .../simpletext/SimpleTextPointsWriter.java | 16 +- .../org/apache/lucene/util/bkd/BKDReader.java | 65 ++++++- .../org/apache/lucene/util/bkd/BKDWriter.java | 183 +++++++++++++++--- .../org/apache/lucene/util/bkd/TestBKD.java | 29 +++ .../index/BasePointsFormatTestCase.java | 29 +++ 6 files changed, 280 insertions(+), 45 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 83d1782c127..c5e6f5c7dfc 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -92,6 +92,9 @@ Optimizations * LUCENE-7351: Doc id compression for points. (Adrien Grand) +* LUCENE-7371: Point values are now better compressed using run-length + encoding. (Adrien Grand) + Other * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java index e54e20abe6a..8d5c0344509 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java @@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext; import java.io.IOException; import java.util.HashMap; import java.util.Map; +import java.util.function.IntFunction; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PointsWriter; @@ -161,12 +162,15 @@ class SimpleTextPointsWriter extends PointsWriter { } @Override - protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int bytesOffset) throws IOException { - // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths - write(out, 
BLOCK_VALUE); - write(out, new BytesRef(bytes, bytesOffset, packedBytesLength).toString()); - newline(out); - } + protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction packedValues) throws IOException { + for (int i = 0; i < count; ++i) { + BytesRef packedValue = packedValues.apply(i); + // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths + write(out, BLOCK_VALUE); + write(out, packedValue.toString()); + newline(out); + } + } }) { values.intersect(fieldInfo.name, new IntersectVisitor() { diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 3566bc11d5b..9ca0bb493ac 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.store.IndexInput; @@ -345,6 +346,63 @@ public class BKDReader implements Accountable { protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { visitor.grow(count); + + readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in); + + int compressedDim = version < BKDWriter.VERSION_COMPRESSED_VALUES + ? 
-1 + : readCompressedDim(in); + + if (compressedDim == -1) { + visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor); + } else { + visitCompressedDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor, compressedDim); + } + } + + // Just read suffixes for every dimension + private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + for (int i = 0; i < count; ++i) { + for(int dim=0;dimmaxPointsInLeafNode. Values that fall exactly * on a cell boundary may be in either cell. * - *

The number of dimensions can be 1 to 255, but every byte[] value is fixed length. + *

The number of dimensions can be 1 to 8, but every byte[] value is fixed length. * *

* See this paper for details. @@ -69,7 +71,7 @@ import org.apache.lucene.util.StringHelper; * and then uses up to the specified {@code maxMBSortInHeap} heap space for writing. * *

- * NOTE: This can write at most Integer.MAX_VALUE * maxPointsInLeafNode total points, and + * NOTE: This can write at most Integer.MAX_VALUE * maxPointsInLeafNode total points. * * @lucene.experimental */ @@ -78,7 +80,8 @@ public class BKDWriter implements Closeable { public static final String CODEC_NAME = "BKD"; public static final int VERSION_START = 0; public static final int VERSION_COMPRESSED_DOC_IDS = 1; - public static final int VERSION_CURRENT = VERSION_COMPRESSED_DOC_IDS; + public static final int VERSION_COMPRESSED_VALUES = 2; + public static final int VERSION_CURRENT = VERSION_COMPRESSED_VALUES; /** How many bytes each docs takes in the fixed-width offline format */ private final int bytesPerDoc; @@ -312,6 +315,8 @@ public class BKDWriter implements Closeable { /** Which leaf block we are up to */ private int blockID; + private final byte[] packedValues; + public MergeReader(BKDReader bkd, MergeState.DocMap docMap) throws IOException { this.bkd = bkd; state = new BKDReader.IntersectState(bkd.in.clone(), @@ -327,6 +332,7 @@ public class BKDWriter implements Closeable { //System.out.println(" leaf fp=" + fp); } state.in.seek(minFP); + this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength]; } public boolean next() throws IOException { @@ -341,18 +347,33 @@ public class BKDWriter implements Closeable { docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs); assert docsInBlock > 0; docBlockUpto = 0; - for(int dim=0;dim 0) { - state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim, prefix); + bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() { + int i = 0; + + @Override + public void visit(int docID) throws IOException { + throw new UnsupportedOperationException(); } - } + + @Override + public void visit(int docID, byte[] packedValue) throws IOException { + assert docID == state.scratchDocIDs[i]; + 
System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength); + i++; + } + + @Override + public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { + throw new UnsupportedOperationException(); + } + + }); blockID++; } - int oldDocID = state.scratchDocIDs[docBlockUpto++]; + final int index = docBlockUpto++; + int oldDocID = state.scratchDocIDs[index]; int mappedDocID; if (docMap == null) { @@ -360,13 +381,11 @@ public class BKDWriter implements Closeable { } else { mappedDocID = docMap.get(oldDocID); } - for(int dim=0;dim packedValues = new IntFunction() { + final BytesRef scratch = new BytesRef(); + + { + scratch.length = packedBytesLength; + scratch.offset = 0; + } + + @Override + public BytesRef apply(int i) { + scratch.bytes = leafBlockPackedValues[i]; + return scratch; + } + }; + writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues); leafCount = 0; } @@ -896,13 +926,57 @@ public class BKDWriter implements Closeable { DocIdsWriter.writeDocIds(docIDs, start, count, out); } - protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException { - for(int dim=0;dim packedValues) throws IOException { + int prefixLenSum = Arrays.stream(commonPrefixLengths).sum(); + if (prefixLenSum == packedBytesLength) { + // all values in this block are equal + out.writeByte((byte) -1); + } else { + assert commonPrefixLengths[sortedDim] < bytesPerDim; + out.writeByte((byte) sortedDim); + int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim]; + commonPrefixLengths[sortedDim]++; + for (int i = 0; i < count; ) { + // do run-length compression on the byte at compressedByteOffset + int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset); + assert runLen <= 0xff; + BytesRef first = packedValues.apply(i); + byte prefixByte = first.bytes[first.offset + compressedByteOffset]; + 
out.writeByte(prefixByte); + out.writeByte((byte) runLen); + writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues); + i += runLen; + assert i <= count; + } } } + private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction packedValues) throws IOException { + for (int i = start; i < end; ++i) { + BytesRef ref = packedValues.apply(i); + assert ref.length == packedBytesLength; + + for(int dim=0;dim packedValues, int start, int end, int byteOffset) { + BytesRef first = packedValues.apply(start); + byte b = first.bytes[first.offset + byteOffset]; + for (int i = start + 1; i < end; ++i) { + BytesRef ref = packedValues.apply(i); + byte b2 = ref.bytes[ref.offset + byteOffset]; + assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b); + if (b != b2) { + return i - start; + } + } + return end - start; + } + protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException { for(int dim=0;dim= leafNodeOffset) { // Leaf node: write block + // We can write the block in any order so by default we write it sorted by the dimension that has the + // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient + int sortedDim = 0; + int sortedDimCardinality = Integer.MAX_VALUE; + for (int dim=0;dim= maxPointsInLeafNode, so we better be in heap at this point: HeapPointWriter heapSource = (HeapPointWriter) source.writer; @@ -1105,15 +1204,21 @@ public class BKDWriter implements Closeable { writeCommonPrefixes(out, commonPrefixLengths, scratch1); // Write the full values: - byte[] lastPackedValue = new byte[bytesPerDim]; - for (int i=0;i packedValues = new IntFunction() { + final BytesRef scratch = new BytesRef(); - // Make sure this value does in fact fall within this leaf cell: - assert valueInBounds(scratchBytesRef, minPackedValue, maxPackedValue); - writeLeafBlockPackedValue(out, commonPrefixLengths, 
scratchBytesRef.bytes, scratchBytesRef.offset); - } + { + scratch.length = packedBytesLength; + } + + @Override + public BytesRef apply(int i) { + heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch); + return scratch; + } + }; + assert valuesInOrderAndBounds(count, minPackedValue, maxPackedValue, packedValues); + writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues); } else { // Inner node: partition/recurse @@ -1215,6 +1320,20 @@ public class BKDWriter implements Closeable { } } + // only called from assert + private boolean valuesInOrderAndBounds(int count, byte[] minPackedValue, byte[] maxPackedValue, IntFunction values) throws IOException { + byte[] lastPackedValue = new byte[bytesPerDim]; + for (int i=0;i 0 && StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, packedValueOffset) > 0) { diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java index e8b88fc8d8e..9eb1fd3ff09 100644 --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java @@ -507,6 +507,35 @@ public class TestBKD extends LuceneTestCase { verify(docValues, null, numDims, numBytesPerDim); } + // this should trigger run-length compression with lengths that are greater than 255 + public void testOneDimTwoValues() throws Exception { + int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); + int numDims = TestUtil.nextInt(random(), 1, 5); + + int numDocs = atLeast(1000); + int theDim = random().nextInt(numDims); + byte[] value1 = new byte[numBytesPerDim]; + random().nextBytes(value1); + byte[] value2 = new byte[numBytesPerDim]; + random().nextBytes(value2); + byte[][][] docValues = new byte[numDocs][][]; + + for(int docID=0;docID