LUCENE-8868: New storing strategy for BKD tree leaves with low cardinality (#730)

When a leaf has only a few distinct values, we store each distinct value once, together with its cardinality.
Author: Ignacio Vera, 2019-06-26 10:16:12 +02:00 (committed by GitHub)
parent 36eaf75b1f
commit dac4310129
5 changed files with 241 additions and 35 deletions

lucene/CHANGES.txt

@@ -128,6 +128,10 @@ Optimizations
Now caller threads execute at least one search on an index even if there is
an executor provided to minimize thread context switching. (Simon Willnauer)
* LUCENE-8868: New storing strategy for BKD tree leaves with low cardinality.
It stores the distinct values once with the cardinality value reducing the
storage cost.
Test Framework
* LUCENE-8825: CheckHits now display the shard index in case of mismatch

lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java

@@ -441,16 +441,22 @@ public final class BKDReader extends PointValues implements Accountable {
void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
if (version >= BKDWriter.VERSION_LOW_CARDINALITY_LEAVES) {
visitDocValuesWithCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor);
} else {
visitDocValuesNoCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor);
}
}
void visitDocValuesNoCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in);
if (numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) {
byte[] minPackedValue = scratchMinIndexPackedValue;
System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength);
byte[] maxPackedValue = scratchMaxIndexPackedValue;
-//Copy common prefixes before reading adjusted
-// box
+// Copy common prefixes before reading adjusted box
System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength);
readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);
@@ -480,12 +486,61 @@ public final class BKDReader extends PointValues implements Accountable {
int compressedDim = readCompressedDim(in);
if (compressedDim == -1) {
-visitRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor);
+visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor);
} else {
visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim);
}
}
void visitDocValuesWithCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in);
int compressedDim = readCompressedDim(in);
if (compressedDim == -1) {
// all values are the same
visitor.grow(count);
visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor);
} else {
if (numIndexDims != 1) {
byte[] minPackedValue = scratchMinIndexPackedValue;
System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength);
byte[] maxPackedValue = scratchMaxIndexPackedValue;
// Copy common prefixes before reading adjusted box
System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength);
readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);
// The index gives us range of values for each dimension, but the actual range of values
// might be much more narrow than what the index told us, so we double check the relation
// here, which is cheap yet might help figure out that the block either entirely matches
// or does not match at all. This is especially more likely in the case that there are
// multiple dimensions that have correlation, ie. splitting on one dimension also
// significantly changes the range of values in another dimension.
Relation r = visitor.compare(minPackedValue, maxPackedValue);
if (r == Relation.CELL_OUTSIDE_QUERY) {
return;
}
visitor.grow(count);
if (r == Relation.CELL_INSIDE_QUERY) {
for (int i = 0; i < count; ++i) {
visitor.visit(docIDs[i]);
}
return;
}
} else {
visitor.grow(count);
}
if (compressedDim == -2) {
// low cardinality values
visitSparseRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor);
} else {
// high cardinality
visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim);
}
}
}
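
The comment above explains why the leaf re-checks its actual min/max against the query even though the index already provided bounds. Here is a minimal standalone illustration of that short-circuit, using hypothetical 1D integer bounds instead of Lucene's packed byte comparisons; the names are illustrative, not the BKDReader API.

// Standalone sketch of the min/max re-check; the 1D int simplification is an assumption.
public class BoundsCheckSketch {
  enum Relation { CELL_INSIDE_QUERY, CELL_CROSSES_QUERY, CELL_OUTSIDE_QUERY }

  // Compare a query range [lo, hi] against a leaf's actual value range [min, max].
  static Relation compare(int lo, int hi, int min, int max) {
    if (max < lo || min > hi) return Relation.CELL_OUTSIDE_QUERY;  // skip the leaf entirely
    if (min >= lo && max <= hi) return Relation.CELL_INSIDE_QUERY; // visit doc IDs without values
    return Relation.CELL_CROSSES_QUERY;                            // must check each point
  }

  public static void main(String[] args) {
    // Query [10, 20], leaf actually spans [12, 18]: every point matches,
    // so the reader can call visit(docID) without decoding any values.
    System.out.println(compare(10, 20, 12, 18)); // CELL_INSIDE_QUERY
  }
}
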
private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException {
for (int dim = 0; dim < numIndexDims; dim++) {
int prefix = commonPrefixLengths[dim];
@@ -494,13 +549,28 @@ public final class BKDReader extends PointValues implements Accountable {
}
}
-// Just read suffixes for every dimension
-private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
-for (int i = 0; i < count; ++i) {
-for(int dim=0;dim<numDataDims;dim++) {
+// read cardinality and point
+private void visitSparseRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+int i;
+for (i = 0; i < count;) {
+int length = in.readVInt();
+for(int dim = 0; dim < numDataDims; dim++) {
int prefix = commonPrefixLengths[dim];
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
}
+for (int j = i; j < i + length; j++) {
+visitor.visit(docIDs[j], scratchPackedValue);
+}
+i += length;
+}
+if (i != count) {
+throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
+}
}
// point is under commonPrefix
private void visitUniqueRawDocValues(byte[] scratchPackedValue, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; i++) {
visitor.visit(docIDs[i], scratchPackedValue);
}
}
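
For intuition about the on-disk shape that visitSparseRawDocValues consumes, here is a self-contained sketch that decodes the same run structure (a vInt run length followed by the value's suffix bytes) from a plain byte array. It reimplements the variable-length int outside of Lucene's IndexInput; all class and method names are illustrative, and it assumes a single data dimension with no common prefix to stay short.

import java.util.ArrayList;
import java.util.List;

// Standalone decoder for runs of (vInt length, value bytes); not the real BKDReader code.
public class SparseRunsSketch {
  private final byte[] buf;
  private int pos;

  SparseRunsSketch(byte[] buf) { this.buf = buf; }

  // Same 7-bits-per-byte encoding as Lucene's readVInt: high bit set on all but the last byte.
  int readVInt() {
    int b = buf[pos++], value = b & 0x7F;
    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
      b = buf[pos++];
      value |= (b & 0x7F) << shift;
    }
    return value;
  }

  // Decode runs until `count` points have been produced, like the loop above.
  List<byte[]> decode(int count, int bytesPerValue) {
    List<byte[]> points = new ArrayList<>();
    int seen = 0;
    while (seen < count) {
      int runLen = readVInt();
      byte[] value = new byte[bytesPerValue];
      System.arraycopy(buf, pos, value, 0, bytesPerValue);
      pos += bytesPerValue;
      for (int j = 0; j < runLen; j++) {
        points.add(value); // every doc in the run shares this value
      }
      seen += runLen;
    }
    if (seen != count) {
      throw new IllegalStateException("Sub blocks do not add up to the expected count: " + count + " != " + seen);
    }
    return points;
  }

  public static void main(String[] args) {
    byte[] encoded = {3, 0x0A, 2, 0x0B}; // a run of 3 x 0x0A, then 2 x 0x0B
    System.out.println(new SparseRunsSketch(encoded).decode(5, 1).size()); // 5
  }
}
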
@@ -515,7 +585,7 @@ public final class BKDReader extends PointValues implements Accountable {
scratchPackedValue[compressedByteOffset] = in.readByte();
final int runLen = Byte.toUnsignedInt(in.readByte());
for (int j = 0; j < runLen; ++j) {
-for(int dim=0;dim<numDataDims;dim++) {
+for(int dim = 0; dim < numDataDims; dim++) {
int prefix = commonPrefixLengths[dim];
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
}
@@ -530,7 +600,7 @@ public final class BKDReader extends PointValues implements Accountable {
private int readCompressedDim(IndexInput in) throws IOException {
int compressedDim = in.readByte();
-if (compressedDim < -1 || compressedDim >= numDataDims) {
+if (compressedDim < -2 || compressedDim >= numDataDims || (version < BKDWriter.VERSION_LOW_CARDINALITY_LEAVES && compressedDim == -2)) {
throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
}
return compressedDim;
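
Taken together, the tag byte that readCompressedDim validates now selects between three leaf encodings. A compact sketch of the dispatch; the class, method, and messages here are illustrative, not Lucene API.

// Illustrative summary of the tag byte cases validated above.
public class LeafTagSketch {
  static String describe(int tag, int numDataDims) {
    if (tag == -1) return "single value: all points share the full packed value";
    if (tag == -2) return "low cardinality: runs of (vInt count, per-dim suffixes)";
    if (tag >= 0 && tag < numDataDims) return "high cardinality: run-length on a byte of dim " + tag;
    throw new IllegalArgumentException("Got compressedDim=" + tag);
  }

  public static void main(String[] args) {
    System.out.println(describe(-2, 2)); // the new encoding introduced by this change
  }
}

Note that -2 is only legal for segments written with VERSION_LOW_CARDINALITY_LEAVES or later, which is what the version check above enforces.
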

lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java

@@ -80,7 +80,8 @@ public class BKDWriter implements Closeable {
//public static final int VERSION_CURRENT = VERSION_START;
public static final int VERSION_LEAF_STORES_BOUNDS = 5;
public static final int VERSION_SELECTIVE_INDEXING = 6;
-public static final int VERSION_CURRENT = VERSION_SELECTIVE_INDEXING;
+public static final int VERSION_LOW_CARDINALITY_LEAVES = 7;
+public static final int VERSION_CURRENT = VERSION_LOW_CARDINALITY_LEAVES;
/** How many bytes each docs takes in the fixed-width offline format */
private final int bytesPerDoc;
@@ -516,6 +517,7 @@ public class BKDWriter implements Closeable {
final int[] leafDocs = new int[maxPointsInLeafNode];
private long valueCount;
private int leafCount;
private int leafCardinality;
OneDimensionBKDWriter(IndexOutput out) {
if (numIndexDims != 1) {
@@ -546,6 +548,9 @@ public class BKDWriter implements Closeable {
assert valueInOrder(valueCount + leafCount,
0, lastPackedValue, packedValue, 0, docID, lastDocID);
if (leafCount == 0 || Arrays.mismatch(leafValues, (leafCount - 1) * bytesPerDim, leafCount * bytesPerDim, packedValue, 0, bytesPerDim) != -1) {
leafCardinality++;
}
System.arraycopy(packedValue, 0, leafValues, leafCount * packedBytesLength, packedBytesLength);
leafDocs[leafCount] = docID;
docsSeen.set(docID);
@@ -558,7 +563,8 @@ public class BKDWriter implements Closeable {
if (leafCount == maxPointsInLeafNode) {
// We write a block once we hit exactly the max count ... this is different from
// when we write N > 1 dimensional points where we write between max/2 and max per leaf block
-writeLeafBlock();
+writeLeafBlock(leafCardinality);
+leafCardinality = 0;
leafCount = 0;
}
@@ -567,7 +573,8 @@ public class BKDWriter implements Closeable {
public long finish() throws IOException {
if (leafCount > 0) {
-writeLeafBlock();
+writeLeafBlock(leafCardinality);
+leafCardinality = 0;
leafCount = 0;
}
@@ -593,7 +600,7 @@ public class BKDWriter implements Closeable {
return indexFP;
}
-private void writeLeafBlock() throws IOException {
+private void writeLeafBlock(int leafCardinality) throws IOException {
assert leafCount != 0;
if (valueCount == 0) {
System.arraycopy(leafValues, 0, minPackedValue, 0, packedIndexBytesLength);
@@ -613,7 +620,7 @@ public class BKDWriter implements Closeable {
int offset = (leafCount - 1) * packedBytesLength;
int prefix = Arrays.mismatch(leafValues, 0, bytesPerDim, leafValues, offset, offset + bytesPerDim);
if (prefix == -1) {
-prefix = bytesPerDim;
+  prefix = bytesPerDim;
}
commonPrefixLengths[0] = prefix;
@@ -635,7 +642,7 @@ public class BKDWriter implements Closeable {
assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength),
ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
packedValues, leafDocs, 0);
-writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
+writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues, leafCardinality);
scratchOut.copyTo(out);
scratchOut.reset();
}
@@ -1028,31 +1035,93 @@ public class BKDWriter implements Closeable {
DocIdsWriter.writeDocIds(docIDs, start, count, out);
}
-private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
+private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues, int leafCardinality) throws IOException {
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
if (prefixLenSum == packedBytesLength) {
// all values in this block are equal
out.writeByte((byte) -1);
} else {
-if (numIndexDims != 1) {
-writeActualBounds(out, commonPrefixLengths, count, packedValues);
-}
assert commonPrefixLengths[sortedDim] < bytesPerDim;
-out.writeByte((byte) sortedDim);
+// estimate if storing the values with cardinality is cheaper than storing all values.
int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
-commonPrefixLengths[sortedDim]++;
-for (int i = 0; i < count; ) {
-// do run-length compression on the byte at compressedByteOffset
-int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
-assert runLen <= 0xff;
-BytesRef first = packedValues.apply(i);
-byte prefixByte = first.bytes[first.offset + compressedByteOffset];
-out.writeByte(prefixByte);
-out.writeByte((byte) runLen);
-writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues);
-i += runLen;
-assert i <= count;
+int highCardinalityCost;
+int lowCardinalityCost;
+if (count == leafCardinality) {
+// all values in this block are different
+highCardinalityCost = 0;
+lowCardinalityCost = 1;
+} else {
+// compute cost of runLen compression
+int numRunLens = 0;
+for (int i = 0; i < count; ) {
+// do run-length compression on the byte at compressedByteOffset
+int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
+assert runLen <= 0xff;
+numRunLens++;
+i += runLen;
+}
+// Add cost of runLen compression
+highCardinalityCost = count * (packedBytesLength - prefixLenSum - 1) + 2 * numRunLens;
+// +1 is the byte needed for storing the cardinality
+lowCardinalityCost = leafCardinality * (packedBytesLength - prefixLenSum + 1);
+}
+if (lowCardinalityCost <= highCardinalityCost) {
+out.writeByte((byte) -2);
+writeLowCardinalityLeafBlockPackedValues(out, commonPrefixLengths, count, packedValues);
+} else {
+out.writeByte((byte) sortedDim);
+writeHighCardinalityLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, compressedByteOffset);
+}
}
}
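
A worked example of the cost estimate above, with made-up numbers: say a leaf holds count = 256 points of packedBytesLength = 8 with prefixLenSum = 4 bytes of common prefix, only leafCardinality = 16 distinct values, and the run-length pass finds numRunLens = 20 runs.

// Worked example of the two cost formulas above; every number is hypothetical.
public class LeafCostExample {
  public static void main(String[] args) {
    int count = 256, packedBytesLength = 8, prefixLenSum = 4;
    int leafCardinality = 16, numRunLens = 20;
    // High cardinality: per point, the suffix minus the run-length-compressed byte,
    // plus 2 header bytes (prefix byte + run length) per run.
    int highCardinalityCost = count * (packedBytesLength - prefixLenSum - 1) + 2 * numRunLens;
    // Low cardinality: per distinct value, the full suffix plus ~1 byte of vInt cardinality.
    int lowCardinalityCost = leafCardinality * (packedBytesLength - prefixLenSum + 1);
    System.out.println(highCardinalityCost + " vs " + lowCardinalityCost); // 808 vs 80
  }
}

Here the low-cardinality encoding wins by an order of magnitude, so the writer emits the -2 tag.
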
private void writeLowCardinalityLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, IntFunction<BytesRef> packedValues) throws IOException {
if (numIndexDims != 1) {
writeActualBounds(out, commonPrefixLengths, count, packedValues);
}
BytesRef value = packedValues.apply(0);
System.arraycopy(value.bytes, value.offset, scratch1, 0, packedBytesLength);
int cardinality = 1;
for (int i = 1; i < count; i++) {
value = packedValues.apply(i);
for(int dim = 0; dim < numDataDims; dim++) {
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
final int end = dim * bytesPerDim + bytesPerDim;
if (Arrays.mismatch(value.bytes, value.offset + start, value.offset + end, scratch1, start, end) != -1) {
out.writeVInt(cardinality);
for (int j = 0; j < numDataDims; j++) {
out.writeBytes(scratch1, j * bytesPerDim + commonPrefixLengths[j], bytesPerDim - commonPrefixLengths[j]);
}
System.arraycopy(value.bytes, value.offset, scratch1, 0, packedBytesLength);
cardinality = 1;
break;
} else if (dim == numDataDims - 1){
cardinality++;
}
}
}
out.writeVInt(cardinality);
for (int i = 0; i < numDataDims; i++) {
out.writeBytes(scratch1, i * bytesPerDim + commonPrefixLengths[i], bytesPerDim - commonPrefixLengths[i]);
}
}
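
Since the values reach this method sorted (fully sorted in the one-dimension case, by sortedDim otherwise), equal values are adjacent and the method emits one (cardinality, suffixes) pair per maximal run in a single pass. A minimal standalone sketch of that grouping, abstracting the per-dimension suffix comparison into Arrays.equals; the names are illustrative.

import java.util.Arrays;

// Emit (runLength, value) for each maximal run of adjacent equal values.
public class RunGroupingSketch {
  public static void main(String[] args) {
    byte[][] values = {{1}, {1}, {1}, {2}, {2}, {3}}; // already grouped by sorting
    int runStart = 0;
    for (int i = 1; i <= values.length; i++) {
      if (i == values.length || !Arrays.equals(values[i], values[runStart])) {
        System.out.println("run of " + (i - runStart) + " x " + Arrays.toString(values[runStart]));
        runStart = i;
      }
    }
  }
}

One subtlety worth noting: for multi-dimensional points the radix sort only orders by sortedDim, so identical packed values are not guaranteed to be adjacent. leafCardinality is then a count of adjacent runs, an upper bound on the true number of distinct values, which is safe because the cost estimate and this encoder use the same run definition.
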
private void writeHighCardinalityLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues, int compressedByteOffset) throws IOException {
if (numIndexDims != 1) {
writeActualBounds(out, commonPrefixLengths, count, packedValues);
}
commonPrefixLengths[sortedDim]++;
for (int i = 0; i < count; ) {
// do run-length compression on the byte at compressedByteOffset
int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
assert runLen <= 0xff;
BytesRef first = packedValues.apply(i);
byte prefixByte = first.bytes[first.offset + compressedByteOffset];
out.writeByte(prefixByte);
out.writeByte((byte) runLen);
writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues);
i += runLen;
assert i <= count;
}
}
@@ -1295,6 +1364,25 @@ public class BKDWriter implements Closeable {
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
reader, from, to, scratchBytesRef1, scratchBytesRef2);
BytesRef comparator = scratchBytesRef1;
BytesRef collector = scratchBytesRef2;
reader.getValue(from, comparator);
int leafCardinality = 1;
for (int i = from + 1; i < to; ++i) {
reader.getValue(i, collector);
for (int dim =0; dim < numDataDims; dim++) {
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
final int end = dim * bytesPerDim + bytesPerDim;
if (Arrays.mismatch(collector.bytes, collector.offset + start, collector.offset + end,
comparator.bytes, comparator.offset + start, comparator.offset + end) != -1) {
leafCardinality++;
BytesRef scratch = collector;
collector = comparator;
comparator = scratch;
break;
}
}
}
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
@@ -1323,7 +1411,7 @@ public class BKDWriter implements Closeable {
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
docIDs, 0);
-writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);
+writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
scratchOut.copyTo(out);
scratchOut.reset();
} else {
@@ -1426,6 +1514,8 @@ public class BKDWriter implements Closeable {
// sort the chosen dimension
radixSelector.heapRadixSort(heapSource, from, to, sortedDim, commonPrefixLengths[sortedDim]);
// compute cardinality
int leafCardinality = heapSource.computeCardinality(from ,to, numDataDims, bytesPerDim, commonPrefixLengths);
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
@@ -1459,7 +1549,7 @@ public class BKDWriter implements Closeable {
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
heapSource.docIDs, from);
-writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
+writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
} else {
// Inner node: partition/recurse

lucene/core/src/java/org/apache/lucene/util/bkd/HeapPointWriter.java

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.util.bkd;
import java.util.Arrays;
import org.apache.lucene.util.BytesRef;
/**
@@ -93,6 +95,23 @@ public final class HeapPointWriter implements PointWriter {
System.arraycopy(scratch, 0, block, indexJ, packedBytesLength);
}
public int computeCardinality(int from, int to, int numDataDims, int bytesPerDim, int[] commonPrefixLengths) {
assert packedBytesLength == numDataDims * bytesPerDim;
int leafCardinality = 1;
for (int i = from + 1; i < to; i++) {
for (int dim = 0; dim < numDataDims; dim++) {
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
final int end = dim * bytesPerDim + bytesPerDim;
if (Arrays.mismatch(block, i * packedBytesLength + start, i * packedBytesLength + end,
block, (i - 1) * packedBytesLength + start, (i - 1) * packedBytesLength + end) != -1) {
leafCardinality++;
break;
}
}
}
return leafCardinality;
}
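
The same adjacent-run counting as a standalone sketch over a flat packed array, mirroring the block layout above but without the common-prefix skip; the class and method names are hypothetical.

import java.util.Arrays;

// Count maximal runs of adjacent equal packed values in a flat byte array.
public class CardinalitySketch {
  static int computeCardinality(byte[] block, int from, int to, int packedBytesLength) {
    int leafCardinality = 1;
    for (int i = from + 1; i < to; i++) {
      int prev = (i - 1) * packedBytesLength;
      int cur = i * packedBytesLength;
      if (Arrays.mismatch(block, cur, cur + packedBytesLength,
                          block, prev, prev + packedBytesLength) != -1) {
        leafCardinality++; // differs from its predecessor: a new run starts
      }
    }
    return leafCardinality;
  }

  public static void main(String[] args) {
    byte[] block = {1, 1, 2, 3, 3, 3}; // six 1-byte points
    System.out.println(computeCardinality(block, 0, 6, 1)); // 3
  }
}

Skipping the first commonPrefixLengths[dim] bytes of each dimension, as the real method does, is purely an optimization: those bytes are equal across the whole leaf by construction.
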
@Override
public long count() {
return nextWrite;

lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java

@@ -614,6 +614,29 @@ public class TestBKD extends LuceneTestCase {
verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
}
// this should trigger low cardinality leaves
public void testRandomFewDifferentValues() throws Exception {
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
int numIndexDims = TestUtil.nextInt(random(), 1, 8);
int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8);
int numDocs = atLeast(10000);
int cardinality = TestUtil.nextInt(random(), 2, 100);
byte[][][] values = new byte[cardinality][numDataDims][numBytesPerDim];
for (int i = 0; i < cardinality; i++) {
for (int j = 0; j < numDataDims; j++) {
random().nextBytes(values[i][j]);
}
}
byte[][][] docValues = new byte[numDocs][][];
for(int docID = 0; docID < numDocs; docID++) {
docValues[docID] = values[random().nextInt(cardinality)];
}
verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
}
public void testMultiValued() throws Exception {
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
int numDataDims = TestUtil.nextInt(random(), 1, 5);