LUCENE-7640: Speed up PointValues#estimatePointCount with Relation.CELL_INSIDE_QUERY.

This commit is contained in:
Adrien Grand 2017-01-18 15:07:06 +01:00
parent 3404677e57
commit 71aa463d4b
3 changed files with 319 additions and 8 deletions

View File

@ -223,6 +223,41 @@ public final class BKDReader extends PointValues implements Accountable {
/** Only valid after pushLeft or pushRight, not pop! */
public abstract long getLeafBlockFP();
/** Return the number of leaves below the current node. */
public int getNumLeaves() {
int leftMostLeafNode = nodeID;
while (leftMostLeafNode < leafNodeOffset) {
leftMostLeafNode = leftMostLeafNode * 2;
}
int rightMostLeafNode = nodeID;
while (rightMostLeafNode < leafNodeOffset) {
rightMostLeafNode = rightMostLeafNode * 2 + 1;
}
final int numLeaves;
if (rightMostLeafNode >= leftMostLeafNode) {
// both are on the same level
numLeaves = rightMostLeafNode - leftMostLeafNode + 1;
} else {
// left is one level deeper than right
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
}
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
return numLeaves;
}
// for assertions
private int getNumLeavesSlow(int node) {
if (node >= 2 * leafNodeOffset) {
return 0;
} else if (node >= leafNodeOffset) {
return 1;
} else {
final int leftCount = getNumLeavesSlow(node * 2);
final int rightCount = getNumLeavesSlow(node * 2 + 1);
return leftCount + rightCount;
}
}
}
/** Reads the original simple yet heap-heavy index format */
@ -716,13 +751,11 @@ public final class BKDReader extends PointValues implements Accountable {
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
return 0L;
} else if (r == Relation.CELL_INSIDE_QUERY) {
return (long) maxPointsInLeafNode * state.index.getNumLeaves();
} else if (state.index.isLeafNode()) {
if (r == Relation.CELL_INSIDE_QUERY) {
return maxPointsInLeafNode;
} else {
// Assume half the points matched
return (maxPointsInLeafNode + 1) / 2;
}
// Assume half the points matched
return (maxPointsInLeafNode + 1) / 2;
} else {
// Non-leaf node: recurse on the split left and right nodes

View File

@ -18,29 +18,43 @@ package org.apache.lucene.codecs.lucene60;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.BasePointsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.bkd.BKDWriter;
/**
* Tests Lucene60PointsFormat
*/
public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
private final Codec codec;
private final int maxPointsInLeafNode;
public TestLucene60PointsFormat() {
// standard issue
Codec defaultCodec = TestUtil.getDefaultCodec();
if (random().nextBoolean()) {
// randomize parameters
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
if (VERBOSE) {
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
@ -66,6 +80,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
} else {
// standard issue
codec = defaultCodec;
maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
}
}
@ -79,5 +94,178 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
assumeFalse("TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec);
super.testMergeStability();
}
public void testEstimatePointCount() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
byte[] pointValue = new byte[3];
byte[] uniquePointValue = new byte[3];
random().nextBytes(uniquePointValue);
final int numDocs = atLeast(10000); // make sure we have several leaves
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
if (i == numDocs / 2) {
doc.add(new BinaryPoint("f", uniquePointValue));
} else {
do {
random().nextBytes(pointValue);
} while (Arrays.equals(pointValue, uniquePointValue));
doc.add(new BinaryPoint("f", pointValue));
}
w.addDocument(doc);
}
w.forceMerge(1);
final IndexReader r = DirectoryReader.open(w);
w.close();
final LeafReader lr = getOnlyLeafReader(r);
PointValues points = lr.getPointValues("f");
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
final int numLeaves = (int) Math.ceil((double) numDocs / maxPointsInLeafNode);
assertEquals(numLeaves * maxPointsInLeafNode,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_INSIDE_QUERY;
}
}));
// Return 0 if no points match
assertEquals(0,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_OUTSIDE_QUERY;
}
}));
// If only one point matches, then the point count is (maxPointsInLeafNode + 1) / 2
assertEquals((maxPointsInLeafNode + 1) / 2,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
return Relation.CELL_CROSSES_QUERY;
}
}));
r.close();
dir.close();
}
// The tree is always balanced in the N dims case, and leaves are
// not all full so things are a bit different
public void testEstimatePointCount2Dims() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
byte[][] pointValue = new byte[2][];
pointValue[0] = new byte[3];
pointValue[1] = new byte[3];
byte[][] uniquePointValue = new byte[2][];
uniquePointValue[0] = new byte[3];
uniquePointValue[1] = new byte[3];
random().nextBytes(uniquePointValue[0]);
random().nextBytes(uniquePointValue[1]);
final int numDocs = atLeast(10000); // make sure we have several leaves
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
if (i == numDocs / 2) {
doc.add(new BinaryPoint("f", uniquePointValue));
} else {
do {
random().nextBytes(pointValue[0]);
random().nextBytes(pointValue[1]);
} while (Arrays.equals(pointValue[0], uniquePointValue[0]) || Arrays.equals(pointValue[1], uniquePointValue[1]));
doc.add(new BinaryPoint("f", pointValue));
}
w.addDocument(doc);
}
w.forceMerge(1);
final IndexReader r = DirectoryReader.open(w);
w.close();
final LeafReader lr = getOnlyLeafReader(r);
PointValues points = lr.getPointValues("f");
// With >1 dims, the tree is balanced
int actualMaxPointsInLeafNode = numDocs;
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
}
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
final int numLeaves = Integer.highestOneBit((numDocs - 1) / actualMaxPointsInLeafNode) << 1;
assertEquals(numLeaves * actualMaxPointsInLeafNode,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_INSIDE_QUERY;
}
}));
// Return 0 if no points match
assertEquals(0,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_OUTSIDE_QUERY;
}
}));
// If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
assertEquals((actualMaxPointsInLeafNode + 1) / 2,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
for (int dim = 0; dim < 2; ++dim) {
if (StringHelper.compare(3, uniquePointValue[0], 0, maxPackedValue, dim * 3) > 0 ||
StringHelper.compare(3, uniquePointValue[0], 0, minPackedValue, dim * 3) < 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
}
return Relation.CELL_CROSSES_QUERY;
}
}));
r.close();
dir.close();
}
}

View File

@ -1104,4 +1104,94 @@ public class TestBKD extends LuceneTestCase {
in.close();
dir.close();
}
public void testEstimatePointCount() throws IOException {
Directory dir = newDirectory();
final int numValues = atLeast(10000); // make sure to have multiple leaves
final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4);
final byte[] pointValue = new byte[numBytesPerDim];
final byte[] uniquePointValue = new byte[numBytesPerDim];
random().nextBytes(uniquePointValue);
BKDWriter w = new BKDWriter(numValues, dir, "_temp", 1, numBytesPerDim, maxPointsInLeafNode,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, numValues, true);
for (int i = 0; i < numValues; ++i) {
if (i == numValues / 2) {
w.add(uniquePointValue, i);
} else {
do {
random().nextBytes(pointValue);
} while (Arrays.equals(pointValue, uniquePointValue));
w.add(pointValue, i);
}
}
final long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
w.close();
}
IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT);
pointsIn.seek(indexFP);
BKDReader points = new BKDReader(pointsIn);
int actualMaxPointsInLeafNode = numValues;
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
}
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1;
assertEquals(numLeaves * actualMaxPointsInLeafNode,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_INSIDE_QUERY;
}
}));
// Return 0 if no points match
assertEquals(0,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_OUTSIDE_QUERY;
}
}));
// If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
assertEquals((actualMaxPointsInLeafNode + 1) / 2,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@Override
public void visit(int docID) throws IOException {}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
return Relation.CELL_CROSSES_QUERY;
}
}));
pointsIn.close();
dir.close();
}
}