mirror of https://github.com/apache/lucene.git
LUCENE-8868: New storing strategy for BKD tree leaves with low cardinality (#730)
When a leaf has only few distinct values, we store the distinct values with the cardinality.
This commit is contained in:
parent
36eaf75b1f
commit
dac4310129
|
@ -128,6 +128,10 @@ Optimizations
|
|||
Now caller threads execute at least one search on an index even if there is
|
||||
an executor provided to minimize thread context switching. (Simon Willnauer)
|
||||
|
||||
* LUCENE-8868: New storing strategy for BKD tree leaves with low cardinality.
|
||||
It stores the distinct values once with the cardinality value reducing the
|
||||
storage cost.
|
||||
|
||||
Test Framework
|
||||
|
||||
* LUCENE-8825: CheckHits now display the shard index in case of mismatch
|
||||
|
|
|
@ -441,16 +441,22 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
|
||||
void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
|
||||
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||
if (version >= BKDWriter.VERSION_LOW_CARDINALITY_LEAVES) {
|
||||
visitDocValuesWithCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor);
|
||||
} else {
|
||||
visitDocValuesNoCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void visitDocValuesNoCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
|
||||
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||
readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in);
|
||||
|
||||
if (numIndexDims != 1 && version >= BKDWriter.VERSION_LEAF_STORES_BOUNDS) {
|
||||
byte[] minPackedValue = scratchMinIndexPackedValue;
|
||||
System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength);
|
||||
byte[] maxPackedValue = scratchMaxIndexPackedValue;
|
||||
//Copy common prefixes before reading adjusted
|
||||
// box
|
||||
// Copy common prefixes before reading adjusted box
|
||||
System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength);
|
||||
readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);
|
||||
|
||||
|
@ -480,12 +486,61 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
int compressedDim = readCompressedDim(in);
|
||||
|
||||
if (compressedDim == -1) {
|
||||
visitRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor);
|
||||
visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor);
|
||||
} else {
|
||||
visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim);
|
||||
}
|
||||
}
|
||||
|
||||
void visitDocValuesWithCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue,
|
||||
IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||
|
||||
readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in);
|
||||
int compressedDim = readCompressedDim(in);
|
||||
if (compressedDim == -1) {
|
||||
// all values are the same
|
||||
visitor.grow(count);
|
||||
visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor);
|
||||
} else {
|
||||
if (numIndexDims != 1) {
|
||||
byte[] minPackedValue = scratchMinIndexPackedValue;
|
||||
System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength);
|
||||
byte[] maxPackedValue = scratchMaxIndexPackedValue;
|
||||
// Copy common prefixes before reading adjusted box
|
||||
System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength);
|
||||
readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in);
|
||||
|
||||
// The index gives us range of values for each dimension, but the actual range of values
|
||||
// might be much more narrow than what the index told us, so we double check the relation
|
||||
// here, which is cheap yet might help figure out that the block either entirely matches
|
||||
// or does not match at all. This is especially more likely in the case that there are
|
||||
// multiple dimensions that have correlation, ie. splitting on one dimension also
|
||||
// significantly changes the range of values in another dimension.
|
||||
Relation r = visitor.compare(minPackedValue, maxPackedValue);
|
||||
if (r == Relation.CELL_OUTSIDE_QUERY) {
|
||||
return;
|
||||
}
|
||||
visitor.grow(count);
|
||||
|
||||
if (r == Relation.CELL_INSIDE_QUERY) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
visitor.visit(docIDs[i]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
visitor.grow(count);
|
||||
}
|
||||
if (compressedDim == -2) {
|
||||
// low cardinality values
|
||||
visitSparseRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor);
|
||||
} else {
|
||||
// high cardinality
|
||||
visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException {
|
||||
for (int dim = 0; dim < numIndexDims; dim++) {
|
||||
int prefix = commonPrefixLengths[dim];
|
||||
|
@ -494,13 +549,28 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
// Just read suffixes for every dimension
|
||||
private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
for(int dim=0;dim<numDataDims;dim++) {
|
||||
// read cardinality and point
|
||||
private void visitSparseRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||
int i;
|
||||
for (i = 0; i < count;) {
|
||||
int length = in.readVInt();
|
||||
for(int dim = 0; dim < numDataDims; dim++) {
|
||||
int prefix = commonPrefixLengths[dim];
|
||||
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
|
||||
}
|
||||
for (int j = i; j < i + length; j++) {
|
||||
visitor.visit(docIDs[j], scratchPackedValue);
|
||||
}
|
||||
i += length;
|
||||
}
|
||||
if (i != count) {
|
||||
throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
|
||||
}
|
||||
}
|
||||
|
||||
// point is under commonPrefix
|
||||
private void visitUniqueRawDocValues(byte[] scratchPackedValue, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
|
||||
for (int i = 0; i < count; i++) {
|
||||
visitor.visit(docIDs[i], scratchPackedValue);
|
||||
}
|
||||
}
|
||||
|
@ -515,7 +585,7 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
scratchPackedValue[compressedByteOffset] = in.readByte();
|
||||
final int runLen = Byte.toUnsignedInt(in.readByte());
|
||||
for (int j = 0; j < runLen; ++j) {
|
||||
for(int dim=0;dim<numDataDims;dim++) {
|
||||
for(int dim = 0; dim < numDataDims; dim++) {
|
||||
int prefix = commonPrefixLengths[dim];
|
||||
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
|
||||
}
|
||||
|
@ -530,7 +600,7 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
|
||||
private int readCompressedDim(IndexInput in) throws IOException {
|
||||
int compressedDim = in.readByte();
|
||||
if (compressedDim < -1 || compressedDim >= numDataDims) {
|
||||
if (compressedDim < -2 || compressedDim >= numDataDims || (version < BKDWriter.VERSION_LOW_CARDINALITY_LEAVES && compressedDim == -2)) {
|
||||
throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
|
||||
}
|
||||
return compressedDim;
|
||||
|
|
|
@ -80,7 +80,8 @@ public class BKDWriter implements Closeable {
|
|||
//public static final int VERSION_CURRENT = VERSION_START;
|
||||
public static final int VERSION_LEAF_STORES_BOUNDS = 5;
|
||||
public static final int VERSION_SELECTIVE_INDEXING = 6;
|
||||
public static final int VERSION_CURRENT = VERSION_SELECTIVE_INDEXING;
|
||||
public static final int VERSION_LOW_CARDINALITY_LEAVES = 7;
|
||||
public static final int VERSION_CURRENT = VERSION_LOW_CARDINALITY_LEAVES;
|
||||
|
||||
/** How many bytes each docs takes in the fixed-width offline format */
|
||||
private final int bytesPerDoc;
|
||||
|
@ -516,6 +517,7 @@ public class BKDWriter implements Closeable {
|
|||
final int[] leafDocs = new int[maxPointsInLeafNode];
|
||||
private long valueCount;
|
||||
private int leafCount;
|
||||
private int leafCardinality;
|
||||
|
||||
OneDimensionBKDWriter(IndexOutput out) {
|
||||
if (numIndexDims != 1) {
|
||||
|
@ -546,6 +548,9 @@ public class BKDWriter implements Closeable {
|
|||
assert valueInOrder(valueCount + leafCount,
|
||||
0, lastPackedValue, packedValue, 0, docID, lastDocID);
|
||||
|
||||
if (leafCount == 0 || Arrays.mismatch(leafValues, (leafCount - 1) * bytesPerDim, leafCount * bytesPerDim, packedValue, 0, bytesPerDim) != -1) {
|
||||
leafCardinality++;
|
||||
}
|
||||
System.arraycopy(packedValue, 0, leafValues, leafCount * packedBytesLength, packedBytesLength);
|
||||
leafDocs[leafCount] = docID;
|
||||
docsSeen.set(docID);
|
||||
|
@ -558,7 +563,8 @@ public class BKDWriter implements Closeable {
|
|||
if (leafCount == maxPointsInLeafNode) {
|
||||
// We write a block once we hit exactly the max count ... this is different from
|
||||
// when we write N > 1 dimensional points where we write between max/2 and max per leaf block
|
||||
writeLeafBlock();
|
||||
writeLeafBlock(leafCardinality);
|
||||
leafCardinality = 0;
|
||||
leafCount = 0;
|
||||
}
|
||||
|
||||
|
@ -567,7 +573,8 @@ public class BKDWriter implements Closeable {
|
|||
|
||||
public long finish() throws IOException {
|
||||
if (leafCount > 0) {
|
||||
writeLeafBlock();
|
||||
writeLeafBlock(leafCardinality);
|
||||
leafCardinality = 0;
|
||||
leafCount = 0;
|
||||
}
|
||||
|
||||
|
@ -593,7 +600,7 @@ public class BKDWriter implements Closeable {
|
|||
return indexFP;
|
||||
}
|
||||
|
||||
private void writeLeafBlock() throws IOException {
|
||||
private void writeLeafBlock(int leafCardinality) throws IOException {
|
||||
assert leafCount != 0;
|
||||
if (valueCount == 0) {
|
||||
System.arraycopy(leafValues, 0, minPackedValue, 0, packedIndexBytesLength);
|
||||
|
@ -613,7 +620,7 @@ public class BKDWriter implements Closeable {
|
|||
int offset = (leafCount - 1) * packedBytesLength;
|
||||
int prefix = Arrays.mismatch(leafValues, 0, bytesPerDim, leafValues, offset, offset + bytesPerDim);
|
||||
if (prefix == -1) {
|
||||
prefix = bytesPerDim;
|
||||
prefix = bytesPerDim;
|
||||
}
|
||||
|
||||
commonPrefixLengths[0] = prefix;
|
||||
|
@ -635,7 +642,7 @@ public class BKDWriter implements Closeable {
|
|||
assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength),
|
||||
ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
|
||||
packedValues, leafDocs, 0);
|
||||
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
|
||||
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues, leafCardinality);
|
||||
scratchOut.copyTo(out);
|
||||
scratchOut.reset();
|
||||
}
|
||||
|
@ -1028,31 +1035,93 @@ public class BKDWriter implements Closeable {
|
|||
DocIdsWriter.writeDocIds(docIDs, start, count, out);
|
||||
}
|
||||
|
||||
private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
|
||||
private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues, int leafCardinality) throws IOException {
|
||||
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
|
||||
if (prefixLenSum == packedBytesLength) {
|
||||
// all values in this block are equal
|
||||
out.writeByte((byte) -1);
|
||||
} else {
|
||||
if (numIndexDims != 1) {
|
||||
writeActualBounds(out, commonPrefixLengths, count, packedValues);
|
||||
}
|
||||
assert commonPrefixLengths[sortedDim] < bytesPerDim;
|
||||
out.writeByte((byte) sortedDim);
|
||||
// estimate if storing the values with cardinality is cheaper than storing all values.
|
||||
int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
|
||||
commonPrefixLengths[sortedDim]++;
|
||||
for (int i = 0; i < count; ) {
|
||||
// do run-length compression on the byte at compressedByteOffset
|
||||
int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
|
||||
assert runLen <= 0xff;
|
||||
BytesRef first = packedValues.apply(i);
|
||||
byte prefixByte = first.bytes[first.offset + compressedByteOffset];
|
||||
out.writeByte(prefixByte);
|
||||
out.writeByte((byte) runLen);
|
||||
writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues);
|
||||
i += runLen;
|
||||
assert i <= count;
|
||||
int highCardinalityCost;
|
||||
int lowCardinalityCost;
|
||||
if (count == leafCardinality) {
|
||||
// all values in this block are different
|
||||
highCardinalityCost = 0;
|
||||
lowCardinalityCost = 1;
|
||||
} else {
|
||||
// compute cost of runLen compression
|
||||
int numRunLens = 0;
|
||||
for (int i = 0; i < count; ) {
|
||||
// do run-length compression on the byte at compressedByteOffset
|
||||
int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
|
||||
assert runLen <= 0xff;
|
||||
numRunLens++;
|
||||
i += runLen;
|
||||
}
|
||||
// Add cost of runLen compression
|
||||
highCardinalityCost = count * (packedBytesLength - prefixLenSum - 1) + 2 * numRunLens;
|
||||
// +1 is the byte needed for storing the cardinality
|
||||
lowCardinalityCost = leafCardinality * (packedBytesLength - prefixLenSum + 1);
|
||||
}
|
||||
if (lowCardinalityCost <= highCardinalityCost) {
|
||||
out.writeByte((byte) -2);
|
||||
writeLowCardinalityLeafBlockPackedValues(out, commonPrefixLengths, count, packedValues);
|
||||
} else {
|
||||
out.writeByte((byte) sortedDim);
|
||||
writeHighCardinalityLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, compressedByteOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeLowCardinalityLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, IntFunction<BytesRef> packedValues) throws IOException {
|
||||
if (numIndexDims != 1) {
|
||||
writeActualBounds(out, commonPrefixLengths, count, packedValues);
|
||||
}
|
||||
BytesRef value = packedValues.apply(0);
|
||||
System.arraycopy(value.bytes, value.offset, scratch1, 0, packedBytesLength);
|
||||
int cardinality = 1;
|
||||
for (int i = 1; i < count; i++) {
|
||||
value = packedValues.apply(i);
|
||||
for(int dim = 0; dim < numDataDims; dim++) {
|
||||
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
|
||||
final int end = dim * bytesPerDim + bytesPerDim;
|
||||
if (Arrays.mismatch(value.bytes, value.offset + start, value.offset + end, scratch1, start, end) != -1) {
|
||||
out.writeVInt(cardinality);
|
||||
for (int j = 0; j < numDataDims; j++) {
|
||||
out.writeBytes(scratch1, j * bytesPerDim + commonPrefixLengths[j], bytesPerDim - commonPrefixLengths[j]);
|
||||
}
|
||||
System.arraycopy(value.bytes, value.offset, scratch1, 0, packedBytesLength);
|
||||
cardinality = 1;
|
||||
break;
|
||||
} else if (dim == numDataDims - 1){
|
||||
cardinality++;
|
||||
}
|
||||
}
|
||||
}
|
||||
out.writeVInt(cardinality);
|
||||
for (int i = 0; i < numDataDims; i++) {
|
||||
out.writeBytes(scratch1, i * bytesPerDim + commonPrefixLengths[i], bytesPerDim - commonPrefixLengths[i]);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeHighCardinalityLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues, int compressedByteOffset) throws IOException {
|
||||
if (numIndexDims != 1) {
|
||||
writeActualBounds(out, commonPrefixLengths, count, packedValues);
|
||||
}
|
||||
commonPrefixLengths[sortedDim]++;
|
||||
for (int i = 0; i < count; ) {
|
||||
// do run-length compression on the byte at compressedByteOffset
|
||||
int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
|
||||
assert runLen <= 0xff;
|
||||
BytesRef first = packedValues.apply(i);
|
||||
byte prefixByte = first.bytes[first.offset + compressedByteOffset];
|
||||
out.writeByte(prefixByte);
|
||||
out.writeByte((byte) runLen);
|
||||
writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues);
|
||||
i += runLen;
|
||||
assert i <= count;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1295,6 +1364,25 @@ public class BKDWriter implements Closeable {
|
|||
MutablePointsReaderUtils.sortByDim(sortedDim, bytesPerDim, commonPrefixLengths,
|
||||
reader, from, to, scratchBytesRef1, scratchBytesRef2);
|
||||
|
||||
BytesRef comparator = scratchBytesRef1;
|
||||
BytesRef collector = scratchBytesRef2;
|
||||
reader.getValue(from, comparator);
|
||||
int leafCardinality = 1;
|
||||
for (int i = from + 1; i < to; ++i) {
|
||||
reader.getValue(i, collector);
|
||||
for (int dim =0; dim < numDataDims; dim++) {
|
||||
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
|
||||
final int end = dim * bytesPerDim + bytesPerDim;
|
||||
if (Arrays.mismatch(collector.bytes, collector.offset + start, collector.offset + end,
|
||||
comparator.bytes, comparator.offset + start, comparator.offset + end) != -1) {
|
||||
leafCardinality++;
|
||||
BytesRef scratch = collector;
|
||||
collector = comparator;
|
||||
comparator = scratch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Save the block file pointer:
|
||||
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
|
||||
|
||||
|
@ -1323,7 +1411,7 @@ public class BKDWriter implements Closeable {
|
|||
};
|
||||
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
|
||||
docIDs, 0);
|
||||
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);
|
||||
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
|
||||
scratchOut.copyTo(out);
|
||||
scratchOut.reset();
|
||||
} else {
|
||||
|
@ -1426,6 +1514,8 @@ public class BKDWriter implements Closeable {
|
|||
|
||||
// sort the chosen dimension
|
||||
radixSelector.heapRadixSort(heapSource, from, to, sortedDim, commonPrefixLengths[sortedDim]);
|
||||
// compute cardinality
|
||||
int leafCardinality = heapSource.computeCardinality(from ,to, numDataDims, bytesPerDim, commonPrefixLengths);
|
||||
|
||||
// Save the block file pointer:
|
||||
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
|
||||
|
@ -1459,7 +1549,7 @@ public class BKDWriter implements Closeable {
|
|||
};
|
||||
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
|
||||
heapSource.docIDs, from);
|
||||
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
|
||||
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues, leafCardinality);
|
||||
|
||||
} else {
|
||||
// Inner node: partition/recurse
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
*/
|
||||
package org.apache.lucene.util.bkd;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
|
@ -93,6 +95,23 @@ public final class HeapPointWriter implements PointWriter {
|
|||
System.arraycopy(scratch, 0, block, indexJ, packedBytesLength);
|
||||
}
|
||||
|
||||
public int computeCardinality(int from, int to, int numDataDims, int bytesPerDim, int[] commonPrefixLengths) {
|
||||
assert packedBytesLength == numDataDims * bytesPerDim;
|
||||
int leafCardinality = 1;
|
||||
for (int i = from + 1; i < to; i++) {
|
||||
for (int dim = 0; dim < numDataDims; dim++) {
|
||||
final int start = dim * bytesPerDim + commonPrefixLengths[dim];
|
||||
final int end = dim * bytesPerDim + bytesPerDim;
|
||||
if (Arrays.mismatch(block, i * packedBytesLength + start, i * packedBytesLength + end,
|
||||
block, (i - 1) * packedBytesLength + start, (i - 1) * packedBytesLength + end) != -1) {
|
||||
leafCardinality++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return leafCardinality;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long count() {
|
||||
return nextWrite;
|
||||
|
|
|
@ -614,6 +614,29 @@ public class TestBKD extends LuceneTestCase {
|
|||
verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
|
||||
}
|
||||
|
||||
// this should trigger low cardinality leaves
|
||||
public void testRandomFewDifferentValues() throws Exception {
|
||||
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int numIndexDims = TestUtil.nextInt(random(), 1, 8);
|
||||
int numDataDims = TestUtil.nextInt(random(), numIndexDims, 8);
|
||||
|
||||
int numDocs = atLeast(10000);
|
||||
int cardinality = TestUtil.nextInt(random(), 2, 100);
|
||||
byte[][][] values = new byte[cardinality][numDataDims][numBytesPerDim];
|
||||
for (int i = 0; i < cardinality; i++) {
|
||||
for (int j = 0; j < numDataDims; j++) {
|
||||
random().nextBytes(values[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
byte[][][] docValues = new byte[numDocs][][];
|
||||
for(int docID = 0; docID < numDocs; docID++) {
|
||||
docValues[docID] = values[random().nextInt(cardinality)];
|
||||
}
|
||||
|
||||
verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
|
||||
}
|
||||
|
||||
public void testMultiValued() throws Exception {
|
||||
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
|
||||
int numDataDims = TestUtil.nextInt(random(), 1, 5);
|
||||
|
|
Loading…
Reference in New Issue