diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 07e57769257..dc26f9be09d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -80,7 +80,10 @@ Improvements Optimizations --------------------- -(No changes) + +* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits + to improve the quality of the tree. N is defined by SPLITS_BEFORE_EXACT_BOUNDS which is set to 4. + (Ignacio Vera, Adrien Grand) Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java index 650d4af82e0..27211c1a7b5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java @@ -95,6 +95,9 @@ public class BKDWriter implements Closeable { /** Maximum number of dimensions */ public static final int MAX_DIMS = 8; + /** Number of splits before we compute the exact bounding box of an inner node. */ + private static final int SPLITS_BEFORE_EXACT_BOUNDS = 4; + /** How many dimensions we are storing at the leaf (data) nodes */ protected final int numDataDims; @@ -233,8 +236,7 @@ public class BKDWriter implements Closeable { int offset = dim*bytesPerDim; if (Arrays.compareUnsigned(packedValue, offset, offset + bytesPerDim, minPackedValue, offset, offset + bytesPerDim) < 0) { System.arraycopy(packedValue, offset, minPackedValue, offset, bytesPerDim); - } - if (Arrays.compareUnsigned(packedValue, offset, offset + bytesPerDim, maxPackedValue, offset, offset + bytesPerDim) > 0) { + } else if (Arrays.compareUnsigned(packedValue, offset, offset + bytesPerDim, maxPackedValue, offset, offset + bytesPerDim) > 0) { System.arraycopy(packedValue, offset, maxPackedValue, offset, bytesPerDim); } } @@ -376,6 +378,27 @@ public class BKDWriter implements Closeable { } } + private void computePackedValueBounds(MutablePointValues values, int from, int to, byte[] minPackedValue, byte[] maxPackedValue, BytesRef scratch) { + if (from == to) { + return; + } + values.getValue(from, scratch); + System.arraycopy(scratch.bytes, scratch.offset, minPackedValue, 0, packedIndexBytesLength); + System.arraycopy(scratch.bytes, scratch.offset, maxPackedValue, 0, packedIndexBytesLength); + for (int i = from + 1 ; i < to; ++i) { + values.getValue(i, scratch); + for(int dim = 0; dim < numIndexDims; dim++) { + final int startOffset = dim * bytesPerDim; + final int endOffset = startOffset + bytesPerDim; + if (Arrays.compareUnsigned(scratch.bytes, scratch.offset + startOffset, scratch.offset + endOffset, minPackedValue, startOffset, endOffset) < 0) { + System.arraycopy(scratch.bytes, scratch.offset + startOffset, minPackedValue, startOffset, bytesPerDim); + } else if (Arrays.compareUnsigned(scratch.bytes, scratch.offset + startOffset, scratch.offset + endOffset, maxPackedValue, startOffset, endOffset) > 0) { + System.arraycopy(scratch.bytes, scratch.offset + startOffset, maxPackedValue, startOffset, bytesPerDim); + } + } + } + } + /* In the 2+D case, we recursively pick the split dimension, compute the * median value and partition other values around it. */ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException { @@ -407,26 +430,14 @@ public class BKDWriter implements Closeable { final long[] leafBlockFPs = new long[numLeaves]; // compute the min/max for this slice - Arrays.fill(minPackedValue, (byte) 0xff); - Arrays.fill(maxPackedValue, (byte) 0); + computePackedValueBounds(values, 0, Math.toIntExact(pointCount), minPackedValue, maxPackedValue, scratchBytesRef1); for (int i = 0; i < Math.toIntExact(pointCount); ++i) { - values.getValue(i, scratchBytesRef1); - for(int dim=0;dim 0) { - System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset, maxPackedValue, offset, bytesPerDim); - } - } - docsSeen.set(values.getDocID(i)); } final int[] parentSplits = new int[numIndexDims]; build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out, - minPackedValue, maxPackedValue, parentSplits, + minPackedValue.clone(), maxPackedValue.clone(), parentSplits, splitPackedValues, leafBlockFPs, new int[maxPointsInLeafNode]); assert Arrays.equals(parentSplits, new int[numIndexDims]); @@ -782,7 +793,7 @@ public class BKDWriter implements Closeable { final int[] parentSplits = new int[numIndexDims]; build(1, numLeaves, points, out, radixSelector, - minPackedValue, maxPackedValue, + minPackedValue.clone(), maxPackedValue.clone(), parentSplits, splitPackedValues, leafBlockFPs, @@ -1418,8 +1429,20 @@ public class BKDWriter implements Closeable { } else { // inner node + final int splitDim; // compute the split dimension and partition around it - final int splitDim = split(minPackedValue, maxPackedValue, parentSplits); + if (numIndexDims == 1) { + splitDim = 0; + } else { + // for dimensions > 2 we recompute the bounds for the current inner node to help the algorithm choose best + // split dimensions. Because it is an expensive operation, the frequency we recompute the bounds is given + // by SPLITS_BEFORE_EXACT_BOUNDS. + if (nodeID > 1 && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) { + computePackedValueBounds(reader, from, to, minPackedValue, maxPackedValue, scratchBytesRef1); + } + splitDim = split(minPackedValue, maxPackedValue, parentSplits); + } + final int mid = (from + to + 1) >>> 1; int commonPrefixLen = Arrays.mismatch(minPackedValue, splitDim * bytesPerDim, @@ -1457,6 +1480,30 @@ public class BKDWriter implements Closeable { } } + + private void computePackedValueBounds(BKDRadixSelector.PathSlice slice, byte[] minPackedValue, byte[] maxPackedValue) throws IOException { + try (PointReader reader = slice.writer.getReader(slice.start, slice.count)) { + if (reader.next() == false) { + return; + } + BytesRef value = reader.pointValue().packedValue(); + System.arraycopy(value.bytes, value.offset, minPackedValue, 0, packedIndexBytesLength); + System.arraycopy(value.bytes, value.offset, maxPackedValue, 0, packedIndexBytesLength); + while (reader.next()) { + value = reader.pointValue().packedValue(); + for(int dim = 0; dim < numIndexDims; dim++) { + final int startOffset = dim * bytesPerDim; + final int endOffset = startOffset + bytesPerDim; + if (Arrays.compareUnsigned(value.bytes, value.offset + startOffset, value.offset + endOffset, minPackedValue, startOffset, endOffset) < 0) { + System.arraycopy(value.bytes, value.offset + startOffset, minPackedValue, startOffset, bytesPerDim); + } else if (Arrays.compareUnsigned(value.bytes, value.offset + startOffset, value.offset + endOffset, maxPackedValue, startOffset, endOffset) > 0) { + System.arraycopy(value.bytes, value.offset + startOffset, maxPackedValue, startOffset, bytesPerDim); + } + } + } + } + } + /** The point writer contains the data that is going to be splitted using radix selection. /* This method is used when we are merging previously written segments, in the numDims > 1 case. */ private void build(int nodeID, int leafNodeOffset, @@ -1562,11 +1609,17 @@ public class BKDWriter implements Closeable { } else { // Inner node: partition/recurse - int splitDim; - if (numIndexDims > 1) { - splitDim = split(minPackedValue, maxPackedValue, parentSplits); - } else { + final int splitDim; + if (numIndexDims == 1) { splitDim = 0; + } else { + // for dimensions > 2 we recompute the bounds for the current inner node to help the algorithm choose best + // split dimensions. Because it is an expensive operation, the frequency we recompute the bounds is given + // by SPLITS_BEFORE_EXACT_BOUNDS. + if (nodeID > 1 && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) { + computePackedValueBounds(points, minPackedValue, maxPackedValue); + } + splitDim = split(minPackedValue, maxPackedValue, parentSplits); } assert nodeID < splitPackedValues.length : "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;