diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ba1fb2a2081..a8734a3d413 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -128,6 +128,10 @@ Optimizations Now caller threads execute at least one search on an index even if there is an executor provided to minimize thread context switching. (Simon Willnauer) +* LUCENE-8868: New storage strategy for BKD tree leaves with low cardinality. + Each distinct value is stored once, together with its cardinality, reducing + the storage cost. + Test Framework * LUCENE-8825: CheckHits now display the shard index in case of mismatch diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index c1f5a63d7de..ffabd6a6eef 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -441,16 +441,22 @@ public final class BKDReader extends PointValues implements Accountable { void visitDocValues(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + if (version >= BKDWriter.VERSION_LOW_CARDINALITY_LEAVES) { + visitDocValuesWithCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor); + } else { + visitDocValuesNoCardinality(commonPrefixLengths, scratchDataPackedValue, scratchMinIndexPackedValue, scratchMaxIndexPackedValue, in, docIDs, count, visitor); + } + } - + void visitDocValuesNoCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue, + IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in); if (numIndexDims != 1 && version >= 
BKDWriter.VERSION_LEAF_STORES_BOUNDS) { byte[] minPackedValue = scratchMinIndexPackedValue; System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength); byte[] maxPackedValue = scratchMaxIndexPackedValue; - //Copy common prefixes before reading adjusted - // box + // Copy common prefixes before reading adjusted box System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength); readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in); @@ -480,12 +486,61 @@ public final class BKDReader extends PointValues implements Accountable { int compressedDim = readCompressedDim(in); if (compressedDim == -1) { - visitRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor); + visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor); } else { visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim); } } + void visitDocValuesWithCardinality(int[] commonPrefixLengths, byte[] scratchDataPackedValue, byte[] scratchMinIndexPackedValue, byte[] scratchMaxIndexPackedValue, + IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { + + readCommonPrefixes(commonPrefixLengths, scratchDataPackedValue, in); + int compressedDim = readCompressedDim(in); + if (compressedDim == -1) { + // all values are the same + visitor.grow(count); + visitUniqueRawDocValues(scratchDataPackedValue, docIDs, count, visitor); + } else { + if (numIndexDims != 1) { + byte[] minPackedValue = scratchMinIndexPackedValue; + System.arraycopy(scratchDataPackedValue, 0, minPackedValue, 0, packedIndexBytesLength); + byte[] maxPackedValue = scratchMaxIndexPackedValue; + // Copy common prefixes before reading adjusted box + System.arraycopy(minPackedValue, 0, maxPackedValue, 0, packedIndexBytesLength); + readMinMax(commonPrefixLengths, minPackedValue, maxPackedValue, in); + + // The index gives us range of values for each dimension, but the 
actual range of values + // might be much narrower than what the index told us, so we double check the relation + // here, which is cheap yet might help figure out that the block either entirely matches + // or does not match at all. This is especially likely when there are + // multiple dimensions that have correlation, i.e. splitting on one dimension also + // significantly changes the range of values in another dimension. + Relation r = visitor.compare(minPackedValue, maxPackedValue); + if (r == Relation.CELL_OUTSIDE_QUERY) { + return; + } + visitor.grow(count); + + if (r == Relation.CELL_INSIDE_QUERY) { + for (int i = 0; i < count; ++i) { + visitor.visit(docIDs[i]); + } + return; + } + } else { + visitor.grow(count); + } + if (compressedDim == -2) { + // low cardinality values + visitSparseRawDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor); + } else { + // high cardinality + visitCompressedDocValues(commonPrefixLengths, scratchDataPackedValue, in, docIDs, count, visitor, compressedDim); + } + } + } + private void readMinMax(int[] commonPrefixLengths, byte[] minPackedValue, byte[] maxPackedValue, IndexInput in) throws IOException { for (int dim = 0; dim < numIndexDims; dim++) { int prefix = commonPrefixLengths[dim]; @@ -494,13 +549,28 @@ public final class BKDReader extends PointValues implements Accountable { } } - // Just read suffixes for every dimension - private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { - for (int i = 0; i < count; ++i) { - for(int dim=0;dim