LUCENE-7401: Make sure BKD trees index all dimensions.

Adrien Grand 2016-12-26 10:10:03 +01:00
parent 5c8a70fb57
commit ba47f530d1
4 changed files with 92 additions and 13 deletions

CHANGES.txt

@@ -172,6 +172,9 @@ Improvements
is no longer needed. Support for older Java 9 builds was removed.
(Uwe Schindler)
* LUCENE-7401: Changed the way BKD trees pick the split dimension in order to
ensure all dimensions are indexed. (Adrien Grand)
Optimizations
* LUCENE-7568: Optimize merging when index sorting is used but the

BKDWriter.java
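
Before the hunks below: the change threads a parentSplits array through the recursion, counting per dimension how many splits the ancestors of the current node have performed, and split() now prefers any dimension that has been split less than half as often as the most-split dimension, as long as that dimension does not only contain equal values; only then does it fall back to the old widest-span rule. What follows is a minimal, self-contained sketch of that selection rule, simplified to int ranges instead of packed byte[] values compared via StringHelper.compare; the names here (SplitDimSketch, pickSplitDim) are illustrative, not BKDWriter API.

/** Illustrative sketch of the new split-dimension heuristic; simplified, not BKDWriter's exact code. */
public class SplitDimSketch {

  /**
   * Pick the next dimension to split, favoring dimensions that have been split
   * less than half as often as the most-split dimension (and that are not constant),
   * otherwise falling back to the dimension with the widest span.
   */
  static int pickSplitDim(int[] min, int[] max, int[] parentSplits) {
    int maxNumSplits = 0;
    for (int numSplits : parentSplits) {
      maxNumSplits = Math.max(maxNumSplits, numSplits);
    }
    // Prefer a dimension that lags behind by more than 2x, unless it only has equal values.
    for (int dim = 0; dim < parentSplits.length; ++dim) {
      if (parentSplits[dim] < maxNumSplits / 2 && min[dim] != max[dim]) {
        return dim;
      }
    }
    // Otherwise pick the dimension with the largest span, as before this change.
    int splitDim = 0;
    for (int dim = 1; dim < parentSplits.length; ++dim) {
      if (max[dim] - min[dim] > max[splitDim] - min[splitDim]) {
        splitDim = dim;
      }
    }
    return splitDim;
  }

  public static void main(String[] args) {
    int[] min = {0, 10};
    int[] max = {1000, 12}; // dim 0 has a much wider span than dim 1
    // dim 1 has never been split while dim 0 has been split 4 times,
    // so the lagging dim 1 is chosen even though its span is tiny.
    System.out.println(pickSplitDim(min, max, new int[] {4, 0})); // prints 1
    // Once the counts are balanced again, the widest-span rule applies.
    System.out.println(pickSplitDim(min, max, new int[] {4, 3})); // prints 0
  }
}

Running main prints 1 and then 0: the lagging dimension wins while its split count is behind, and the widest-span rule takes over once the counts even out.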

@@ -479,9 +479,12 @@ public class BKDWriter implements Closeable {
docsSeen.set(values.getDocID(i));
}
final int[] parentSplits = new int[numDims];
build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
minPackedValue, maxPackedValue, parentSplits,
splitPackedValues, leafBlockFPs,
new int[maxPointsInLeafNode]);
assert Arrays.equals(parentSplits, new int[numDims]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
@@ -1001,12 +1004,15 @@ public class BKDWriter implements Closeable {
heapPointWriter = null;
}
final int[] parentSplits = new int[numDims];
build(1, numLeaves, sortedPointWriters,
ordBitSet, out,
minPackedValue, maxPackedValue,
parentSplits,
splitPackedValues,
leafBlockFPs,
toCloseHeroically);
assert Arrays.equals(parentSplits, new int[numDims]);
for(PathSlice slice : sortedPointWriters) {
slice.writer.destroy();
@@ -1413,7 +1419,29 @@ public class BKDWriter implements Closeable {
return true;
}
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
/**
* Pick the next dimension to split.
* @param minPackedValue the min values for all dimensions
* @param maxPackedValue the max values for all dimensions
* @param parentSplits how many times each dim has been split on the parent levels
* @return the dimension to split
*/
protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits) {
// First look at whether there is a dimension that has been split less than half as often as
// the dimension with the most splits, and return it if there is such a dimension and it does
// not only contain equal values. This helps ensure all dimensions are indexed.
int maxNumSplits = 0;
for (int numSplits : parentSplits) {
maxNumSplits = Math.max(maxNumSplits, numSplits);
}
for (int dim = 0; dim < numDims; ++dim) {
final int offset = dim * bytesPerDim;
if (parentSplits[dim] < maxNumSplits / 2 &&
StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) != 0) {
return dim;
}
}
// Find which dim has the largest span so we can split on it:
int splitDim = -1;
for(int dim=0;dim<numDims;dim++) {
@@ -1454,6 +1482,7 @@ public class BKDWriter implements Closeable {
MutablePointValues reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
@@ -1547,7 +1576,7 @@ public class BKDWriter implements Closeable {
// inner node
// compute the split dimension and partition around it
final int splitDim = split(minPackedValue, maxPackedValue);
final int splitDim = split(minPackedValue, maxPackedValue, parentSplits);
final int mid = (from + to + 1) >>> 1;
int commonPrefixLen = bytesPerDim;
@@ -1575,10 +1604,14 @@ public class BKDWriter implements Closeable {
maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
// recurse
parentSplits[splitDim]++;
build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
minPackedValue, maxSplitPackedValue, parentSplits,
splitPackedValues, leafBlockFPs, spareDocIds);
build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
minSplitPackedValue, maxPackedValue, parentSplits,
splitPackedValues, leafBlockFPs, spareDocIds);
parentSplits[splitDim]--;
}
}
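
The increment just before the two recursive build calls and the matching decrement right after (here and again in the offline build further down) keep parentSplits scoped to the current path: each node sees only the splits made by its ancestors, and the array is back to all zeros when the top-level call returns, which is what the new asserts in the two callers verify. A tiny illustrative sketch of that depth-first discipline; the names and the fake round-robin split choice are mine, not BKDWriter code.

/** Illustrative only: shows the increment/recurse/decrement bookkeeping used for parentSplits. */
public class ParentSplitsSketch {

  // Simulates a perfectly balanced tree of the given depth, with a fake round-robin
  // split-dimension choice standing in for split(minPackedValue, maxPackedValue, parentSplits).
  static void build(int depth, int maxDepth, int numDims, int[] parentSplits) {
    if (depth == maxDepth) {
      return; // leaf: nothing to split
    }
    int splitDim = depth % numDims;      // placeholder for the real heuristic
    parentSplits[splitDim]++;            // children see one more ancestor split on splitDim
    build(depth + 1, maxDepth, numDims, parentSplits); // left child
    build(depth + 1, maxDepth, numDims, parentSplits); // right child
    parentSplits[splitDim]--;            // restore: siblings and ancestors are unaffected
  }

  public static void main(String[] args) {
    int[] parentSplits = new int[3];
    build(0, 4, 3, parentSplits);
    // Prints [0, 0, 0]: the counts are restored on the way back up, mirroring the
    // assert Arrays.equals(parentSplits, new int[numDims]) added in BKDWriter.
    System.out.println(java.util.Arrays.toString(parentSplits));
  }
}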
@@ -1589,6 +1622,7 @@ public class BKDWriter implements Closeable {
LongBitSet ordBitSet,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs,
List<Closeable> toCloseHeroically) throws IOException {
@@ -1699,7 +1733,7 @@ public class BKDWriter implements Closeable {
int splitDim;
if (numDims > 1) {
splitDim = split(minPackedValue, maxPackedValue);
splitDim = split(minPackedValue, maxPackedValue, parentSplits);
} else {
splitDim = 0;
}
@@ -1767,10 +1801,11 @@ public class BKDWriter implements Closeable {
}
}
parentSplits[splitDim]++;
// Recurse on left tree:
build(2*nodeID, leafNodeOffset, leftSlices,
ordBitSet, out,
minPackedValue, maxSplitPackedValue,
minPackedValue, maxSplitPackedValue, parentSplits,
splitPackedValues, leafBlockFPs, toCloseHeroically);
for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
@@ -1783,7 +1818,7 @@ public class BKDWriter implements Closeable {
// Recurse on right tree:
build(2*nodeID+1, leafNodeOffset, rightSlices,
ordBitSet, out,
minSplitPackedValue, maxPackedValue,
minSplitPackedValue, maxPackedValue, parentSplits,
splitPackedValues, leafBlockFPs, toCloseHeroically);
for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
@@ -1791,6 +1826,7 @@ public class BKDWriter implements Closeable {
rightSlices[dim].writer.destroy();
}
}
parentSplits[splitDim]--;
}
}

TestBKD.java

@@ -505,7 +505,45 @@ public class TestBKD extends LuceneTestCase {
}
}
verify(docValues, null, numDims, numBytesPerDim);
// Use a small number of points in leaf blocks to trigger a lot of splitting
verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50));
}
// This triggers the logic that makes sure all dimensions get indexed
// by looking at how many times each dim has been split
public void testOneDimLowCard() throws Exception {
int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
int numDims = TestUtil.nextInt(random(), 2, 5);
int numDocs = atLeast(10000);
int theLowCardDim = random().nextInt(numDims);
byte[] value1 = new byte[numBytesPerDim];
random().nextBytes(value1);
byte[] value2 = value1.clone();
if (value2[numBytesPerDim-1] == 0 || random().nextBoolean()) {
value2[numBytesPerDim-1]++;
} else {
value2[numBytesPerDim-1]--;
}
byte[][][] docValues = new byte[numDocs][][];
for(int docID=0;docID<numDocs;docID++) {
byte[][] values = new byte[numDims][];
for(int dim=0;dim<numDims;dim++) {
if (dim == theLowCardDim) {
values[dim] = random().nextBoolean() ? value1 : value2;
} else {
values[dim] = new byte[numBytesPerDim];
random().nextBytes(values[dim]);
}
}
docValues[docID] = values;
}
// Use a small number of points in leaf blocks to trigger a lot of splitting
verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50));
}
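
The two values of the low-cardinality dimension differ only by one in their last byte, so its span is tiny compared to the fully random dimensions and the old widest-span rule would essentially never pick it; with parentSplits in play it gets split as soon as some other dimension has been split at least twice on the current path (and the cell still contains both values), which is exactly the scenario the new heuristic is meant to catch.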
// this should trigger run-length compression with lengths that are greater than 255
@@ -567,12 +605,14 @@ public class TestBKD extends LuceneTestCase {
verify(docValuesArray, docIDsArray, numDims, numBytesPerDim);
}
/** docIDs can be null, for the single valued case, else it maps value to docID */
private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim) throws Exception {
verify(docValues, docIDs, numDims, numBytesPerDim, TestUtil.nextInt(random(), 50, 1000));
}
private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim,
int maxPointsInLeafNode) throws Exception {
try (Directory dir = getDirectory(docValues.length)) {
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
double maxMB = (float) 3.0 + (3*random().nextDouble());
verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
}

RandomCodec.java

@@ -294,7 +294,7 @@ public class RandomCodec extends AssertingCodec {
}
@Override
protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentDims) {
// BKD normally splits on the widest dimension, to try to make cells as squarish as possible, but we just pick a random one ;)
return random.nextInt(numDims);
}