mirror of https://github.com/apache/lucene.git

LUCENE-8562: Speed up merging segments of points with data dimensions by only sorting on the indexed dimensions

parent 68c0774458
commit 72ca4488d1

lucene/CHANGES.txt:

@@ -225,6 +225,9 @@ Improvements
 * LUCENE-8463: TopFieldCollector can now early-terminates queries when sorting by SortField.DOC.
   (Christophe Bismuth via Jim Ferenczi)

+* LUCENE-8562: Speed up merging segments of points with data dimensions by only sorting on the indexed
+  dimensions. (Ignacio Vera)
+
 Optimizations

 * LUCENE-8552: FieldInfos.getMergedFieldInfos no longer does any merging if there is <= 1 segment.
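
For context, "data dimensions" versus "index dimensions" refers to selective indexing (LUCENE-8496): a point field may declare more data dimensions than index dimensions, and only the indexed ones organize the BKD tree, while the trailing ones are merely stored in the leaves. A minimal sketch of such a field, assuming the FieldType.setDimensions(dataDims, indexDims, bytesPerDim) API from that change; the class, field name, and sizes here are illustrative:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.FieldType;

    class SelectiveIndexingSketch {
      static Document makeDoc(byte[] packed) {
        // Four data dimensions of 8 bytes each; only the first two are
        // indexed (searchable), the trailing two are stored in leaf blocks
        // but never used to organize the tree.
        FieldType type = new FieldType();
        type.setDimensions(4, 2, 8); // dataDims, indexDims, bytesPerDim
        type.freeze();

        Document doc = new Document();
        doc.add(new Field("point", packed, type)); // packed holds all 4 dims
        return doc;
      }
    }

With such a field, this commit sorts the merged points once per indexed dimension (two sorts here) instead of once per data dimension (four).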

lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java:

@@ -767,6 +767,10 @@ public class BKDWriter implements Closeable {
   /** Sort the heap writer by the specified dim */
   private void sortHeapPointWriter(final HeapPointWriter writer, int dim) {
     final int pointCount = Math.toIntExact(this.pointCount);
+    sortHeapPointWriter(writer, pointCount, dim);
+  }
+  /** Sort the heap writer by the specified dim */
+  private void sortHeapPointWriter(final HeapPointWriter writer, int pointCount, int dim) {
     // Tie-break by docID:

     // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it
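
The new overload lets the block added at line 1742 below re-sort an arbitrary number of points by a data dimension, rather than always sorting this.pointCount points. A standalone sketch of the ordering contract (illustrative only, not Lucene's radix-sort implementation): order point indices by one dimension's packed bytes, compared as unsigned, tie-breaking by docID.

    import java.util.Arrays;

    class SortByDimSketch {
      static Integer[] sortedOrder(final byte[][] packedValues, final int[] docIDs,
                                   final int dim, final int bytesPerDim) {
        Integer[] order = new Integer[docIDs.length];
        for (int i = 0; i < order.length; i++) {
          order[i] = i;
        }
        final int offset = dim * bytesPerDim; // start of this dimension's bytes
        Arrays.sort(order, (a, b) -> {
          for (int j = 0; j < bytesPerDim; j++) {
            // compare one byte at a time, unsigned
            int cmp = Integer.compare(packedValues[a][offset + j] & 0xFF,
                                      packedValues[b][offset + j] & 0xFF);
            if (cmp != 0) {
              return cmp;
            }
          }
          return Integer.compare(docIDs[a], docIDs[b]); // tie-break by docID
        });
        return order;
      }
    }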

@@ -959,7 +963,7 @@ public class BKDWriter implements Closeable {
     }

     LongBitSet ordBitSet;
-    if (numDataDims > 1) {
+    if (numIndexDims > 1) {
       if (singleValuePerDoc) {
         ordBitSet = new LongBitSet(maxDoc);
       } else {

@@ -994,7 +998,7 @@ public class BKDWriter implements Closeable {
     assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;

     // Sort all docs once by each dimension:
-    PathSlice[] sortedPointWriters = new PathSlice[numDataDims];
+    PathSlice[] sortedPointWriters = new PathSlice[numIndexDims];

     // This is only used on exception; on normal code paths we close all files we opened:
     List<Closeable> toCloseHeroically = new ArrayList<>();

@@ -1002,9 +1006,7 @@ public class BKDWriter implements Closeable {
     boolean success = false;
     try {
       //long t0 = System.nanoTime();
-      // even with selective indexing we create the sortedPointWriters so we can compress
-      // the leaf node data by common prefix
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         sortedPointWriters[dim] = new PathSlice(sort(dim), 0, pointCount);
       }
       //long t1 = System.nanoTime();
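
Note the removed comment: previously, sorted writers were created for every data dimension even with selective indexing, solely so leaf blocks could be compressed by common prefix. This commit drops those extra sorts and instead computes the common prefixes of the non-indexed dimensions directly when writing a leaf (see the block added at line 1742 below).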

@@ -1445,7 +1447,7 @@ public class BKDWriter implements Closeable {
       boolean result = reader.next();
       assert result: "rightCount=" + rightCount + " source.count=" + source.count + " source.writer=" + source.writer;
       System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
-      if (numDataDims > 1) {
+      if (numIndexDims > 1) {
         assert ordBitSet.get(reader.ord()) == false;
         ordBitSet.set(reader.ord());
         // Subtract 1 from rightCount because we already did the first value above (so we could record the split value):

@@ -1619,7 +1621,7 @@ public class BKDWriter implements Closeable {
         assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
             docIDs, 0);
         writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);

         out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
         scratchOut.reset();

@@ -1678,10 +1680,10 @@ public class BKDWriter implements Closeable {
                     long[] leafBlockFPs,
                     List<Closeable> toCloseHeroically) throws IOException {

-    for(PathSlice slice : slices) {
+    for (PathSlice slice : slices) {
       assert slice.count == slices[0].count;
     }

     if (numDataDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
       // Special case for 1D, to cutover to heap once we recurse deeply enough:
       slices[0] = switchToHeap(slices[0], toCloseHeroically);

@@ -1695,7 +1697,7 @@ public class BKDWriter implements Closeable {
       int sortedDim = 0;
       int sortedDimCardinality = Integer.MAX_VALUE;

-      for (int dim=0;dim<numDataDims;dim++) {
+      for (int dim=0;dim<numIndexDims;dim++) {
         if (slices[dim].writer instanceof HeapPointWriter == false) {
           // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
           // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer

@@ -1740,7 +1742,41 @@ public class BKDWriter implements Closeable {
         }
       }

-      PathSlice source = slices[sortedDim];
+      PathSlice dataDimPathSlice = null;
+
+      if (numDataDims != numIndexDims) {
+        HeapPointWriter heapSource = (HeapPointWriter) slices[0].writer;
+        int from = (int) slices[0].start;
+        int to = from + (int) slices[0].count;
+        Arrays.fill(commonPrefixLengths, numIndexDims, numDataDims, bytesPerDim);
+        heapSource.readPackedValue(from, scratch1);
+        for (int i = from + 1; i < to; ++i) {
+          heapSource.readPackedValue(i, scratch2);
+          for (int dim = numIndexDims; dim < numDataDims; dim++) {
+            final int offset = dim * bytesPerDim;
+            for (int j = 0; j < commonPrefixLengths[dim]; j++) {
+              if (scratch1[offset + j] != scratch2[offset + j]) {
+                commonPrefixLengths[dim] = j;
+                break;
+              }
+            }
+          }
+        }
+        //handle case when all index dimensions contain the same value but not the data dimensions
+        if (commonPrefixLengths[sortedDim] == bytesPerDim) {
+          for (int dim = numIndexDims; dim < numDataDims; ++dim) {
+            if (commonPrefixLengths[dim] != bytesPerDim) {
+              sortedDim = dim;
+              //create a new slice in memory
+              dataDimPathSlice = switchToHeap(slices[0], toCloseHeroically);
+              sortHeapPointWriter((HeapPointWriter) dataDimPathSlice.writer, (int) dataDimPathSlice.count, sortedDim);
+              break;
+            }
+          }
+        }
+      }
+
+      PathSlice source = (dataDimPathSlice != null) ? dataDimPathSlice : slices[sortedDim];

       // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
       HeapPointWriter heapSource = (HeapPointWriter) source.writer;
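
The first half of the added block computes, for each non-indexed data dimension, how many leading bytes all points in the slice share; those prefixes let the leaf writer store only the differing suffixes. A standalone sketch of that computation (the helper and class names are illustrative, not Lucene's API):

    import java.util.Arrays;

    class DataDimPrefixSketch {
      static int[] commonPrefixLengths(byte[][] packedValues, int numIndexDims,
                                       int numDataDims, int bytesPerDim) {
        int[] prefixLengths = new int[numDataDims];
        // Start from the longest possible prefix and shrink on mismatch:
        Arrays.fill(prefixLengths, numIndexDims, numDataDims, bytesPerDim);
        byte[] first = packedValues[0];
        for (int i = 1; i < packedValues.length; i++) {
          for (int dim = numIndexDims; dim < numDataDims; dim++) {
            int offset = dim * bytesPerDim;
            for (int j = 0; j < prefixLengths[dim]; j++) {
              if (first[offset + j] != packedValues[i][offset + j]) {
                prefixLengths[dim] = j; // first mismatching byte caps the prefix
                break;
              }
            }
          }
        }
        return prefixLengths;
      }
    }

The second half handles the edge case where the sorted (indexed) dimension is constant across the slice, i.e. its common prefix spans all bytesPerDim bytes: re-sorting the points in heap by a data dimension that still varies gives the leaf writer better run-length compression.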

@@ -1804,8 +1840,8 @@ public class BKDWriter implements Closeable {

       // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:

-      PathSlice[] leftSlices = new PathSlice[numDataDims];
-      PathSlice[] rightSlices = new PathSlice[numDataDims];
+      PathSlice[] leftSlices = new PathSlice[numIndexDims];
+      PathSlice[] rightSlices = new PathSlice[numIndexDims];

       byte[] minSplitPackedValue = new byte[packedIndexBytesLength];
       System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedIndexBytesLength);

@@ -1815,13 +1851,13 @@ public class BKDWriter implements Closeable {

       // When we are on this dim, below, we clear the ordBitSet:
       int dimToClear;
-      if (numDataDims - 1 == splitDim) {
-        dimToClear = numDataDims - 2;
+      if (numIndexDims - 1 == splitDim) {
+        dimToClear = numIndexDims - 2;
       } else {
-        dimToClear = numDataDims - 1;
+        dimToClear = numIndexDims - 1;
       }

-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {

         if (dim == splitDim) {
           // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we

@@ -1858,7 +1894,7 @@ public class BKDWriter implements Closeable {
                 ordBitSet, out,
                 minPackedValue, maxSplitPackedValue, parentSplits,
                 splitPackedValues, leafBlockFPs, toCloseHeroically);
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
         if (dim != splitDim) {
           leftSlices[dim].writer.destroy();
|
@ -1871,7 +1907,7 @@ public class BKDWriter implements Closeable {
|
|||
ordBitSet, out,
|
||||
minSplitPackedValue, maxPackedValue, parentSplits,
|
||||
splitPackedValues, leafBlockFPs, toCloseHeroically);
|
||||
for(int dim=0;dim<numDataDims;dim++) {
|
||||
for(int dim=0;dim<numIndexDims;dim++) {
|
||||
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
|
||||
if (dim != splitDim) {
|
||||
rightSlices[dim].writer.destroy();
|
||||
|
|
|

lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java:

@@ -492,6 +492,35 @@ public class TestBKD extends LuceneTestCase {
     verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
   }

+  public void testIndexDimEqualDataDimDifferent() throws Exception {
+    int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int numDataDims = TestUtil.nextInt(random(), 2, 5);
+    int numIndexDims = TestUtil.nextInt(random(), 1, numDataDims - 1);
+
+    int numDocs = atLeast(1000);
+    byte[][][] docValues = new byte[numDocs][][];
+
+    byte[][] indexDimensions = new byte[numDataDims][];
+    for(int dim=0;dim<numIndexDims;dim++) {
+      indexDimensions[dim] = new byte[numBytesPerDim];
+      random().nextBytes(indexDimensions[dim]);
+    }
+
+    for(int docID=0;docID<numDocs;docID++) {
+      byte[][] values = new byte[numDataDims][];
+      for(int dim=0;dim<numIndexDims;dim++) {
+        values[dim] = indexDimensions[dim];
+      }
+      for (int dim = numIndexDims; dim < numDataDims; dim++) {
+        values[dim] = new byte[numBytesPerDim];
+        random().nextBytes(values[dim]);
+      }
+      docValues[docID] = values;
+    }
+
+    verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
+  }
+
   public void testOneDimEqual() throws Exception {
     int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
     int numDataDims = TestUtil.nextInt(random(), 1, 5);
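
The new test gives every document identical values in all indexed dimensions while randomizing the trailing data dimensions, so each leaf's indexed dimensions share a full bytesPerDim common prefix. That forces build() into the fallback added above, where a still-varying data dimension becomes sortedDim and the slice is re-sorted in heap before the leaf is written.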