mirror of https://github.com/apache/lucene.git
LUCENE-7640: Speed up PointValues#estimatePointCount with Relation.CELL_INSIDE_QUERY.
This commit is contained in:
parent
3404677e57
commit
71aa463d4b
|
@ -223,6 +223,41 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
|
||||
/** Only valid after pushLeft or pushRight, not pop! */
|
||||
public abstract long getLeafBlockFP();
|
||||
|
||||
/** Return the number of leaves below the current node. */
|
||||
public int getNumLeaves() {
|
||||
int leftMostLeafNode = nodeID;
|
||||
while (leftMostLeafNode < leafNodeOffset) {
|
||||
leftMostLeafNode = leftMostLeafNode * 2;
|
||||
}
|
||||
int rightMostLeafNode = nodeID;
|
||||
while (rightMostLeafNode < leafNodeOffset) {
|
||||
rightMostLeafNode = rightMostLeafNode * 2 + 1;
|
||||
}
|
||||
final int numLeaves;
|
||||
if (rightMostLeafNode >= leftMostLeafNode) {
|
||||
// both are on the same level
|
||||
numLeaves = rightMostLeafNode - leftMostLeafNode + 1;
|
||||
} else {
|
||||
// left is one level deeper than right
|
||||
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
|
||||
}
|
||||
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
|
||||
return numLeaves;
|
||||
}
|
||||
|
||||
// for assertions
|
||||
private int getNumLeavesSlow(int node) {
|
||||
if (node >= 2 * leafNodeOffset) {
|
||||
return 0;
|
||||
} else if (node >= leafNodeOffset) {
|
||||
return 1;
|
||||
} else {
|
||||
final int leftCount = getNumLeavesSlow(node * 2);
|
||||
final int rightCount = getNumLeavesSlow(node * 2 + 1);
|
||||
return leftCount + rightCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Reads the original simple yet heap-heavy index format */
|
||||
|
@ -716,13 +751,11 @@ public final class BKDReader extends PointValues implements Accountable {
|
|||
if (r == Relation.CELL_OUTSIDE_QUERY) {
|
||||
// This cell is fully outside of the query shape: stop recursing
|
||||
return 0L;
|
||||
} else if (r == Relation.CELL_INSIDE_QUERY) {
|
||||
return (long) maxPointsInLeafNode * state.index.getNumLeaves();
|
||||
} else if (state.index.isLeafNode()) {
|
||||
if (r == Relation.CELL_INSIDE_QUERY) {
|
||||
return maxPointsInLeafNode;
|
||||
} else {
|
||||
// Assume half the points matched
|
||||
return (maxPointsInLeafNode + 1) / 2;
|
||||
}
|
||||
// Assume half the points matched
|
||||
return (maxPointsInLeafNode + 1) / 2;
|
||||
} else {
|
||||
|
||||
// Non-leaf node: recurse on the split left and right nodes
|
||||
|
|
|
@ -18,29 +18,43 @@ package org.apache.lucene.codecs.lucene60;
|
|||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.FilterCodec;
|
||||
import org.apache.lucene.codecs.PointsFormat;
|
||||
import org.apache.lucene.codecs.PointsReader;
|
||||
import org.apache.lucene.codecs.PointsWriter;
|
||||
import org.apache.lucene.document.BinaryPoint;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.BasePointsFormatTestCase;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.PointValues;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||
import org.apache.lucene.index.PointValues.Relation;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.bkd.BKDWriter;
|
||||
|
||||
/**
|
||||
* Tests Lucene60PointsFormat
|
||||
*/
|
||||
public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
||||
private final Codec codec;
|
||||
private final int maxPointsInLeafNode;
|
||||
|
||||
public TestLucene60PointsFormat() {
|
||||
// standard issue
|
||||
Codec defaultCodec = TestUtil.getDefaultCodec();
|
||||
if (random().nextBoolean()) {
|
||||
// randomize parameters
|
||||
int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
|
||||
maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
|
||||
double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
|
||||
|
@ -66,6 +80,7 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
|||
} else {
|
||||
// standard issue
|
||||
codec = defaultCodec;
|
||||
maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -79,5 +94,178 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
|
|||
assumeFalse("TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec);
|
||||
super.testMergeStability();
|
||||
}
|
||||
|
||||
|
||||
public void testEstimatePointCount() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
|
||||
byte[] pointValue = new byte[3];
|
||||
byte[] uniquePointValue = new byte[3];
|
||||
random().nextBytes(uniquePointValue);
|
||||
final int numDocs = atLeast(10000); // make sure we have several leaves
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
Document doc = new Document();
|
||||
if (i == numDocs / 2) {
|
||||
doc.add(new BinaryPoint("f", uniquePointValue));
|
||||
} else {
|
||||
do {
|
||||
random().nextBytes(pointValue);
|
||||
} while (Arrays.equals(pointValue, uniquePointValue));
|
||||
doc.add(new BinaryPoint("f", pointValue));
|
||||
}
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.forceMerge(1);
|
||||
final IndexReader r = DirectoryReader.open(w);
|
||||
w.close();
|
||||
final LeafReader lr = getOnlyLeafReader(r);
|
||||
PointValues points = lr.getPointValues("f");
|
||||
|
||||
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
|
||||
final int numLeaves = (int) Math.ceil((double) numDocs / maxPointsInLeafNode);
|
||||
assertEquals(numLeaves * maxPointsInLeafNode,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_INSIDE_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
// Return 0 if no points match
|
||||
assertEquals(0,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
// If only one point matches, then the point count is (maxPointsInLeafNode + 1) / 2
|
||||
assertEquals((maxPointsInLeafNode + 1) / 2,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
|
||||
StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
}
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// The tree is always balanced in the N dims case, and leaves are
|
||||
// not all full so things are a bit different
|
||||
public void testEstimatePointCount2Dims() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
|
||||
byte[][] pointValue = new byte[2][];
|
||||
pointValue[0] = new byte[3];
|
||||
pointValue[1] = new byte[3];
|
||||
byte[][] uniquePointValue = new byte[2][];
|
||||
uniquePointValue[0] = new byte[3];
|
||||
uniquePointValue[1] = new byte[3];
|
||||
random().nextBytes(uniquePointValue[0]);
|
||||
random().nextBytes(uniquePointValue[1]);
|
||||
final int numDocs = atLeast(10000); // make sure we have several leaves
|
||||
for (int i = 0; i < numDocs; ++i) {
|
||||
Document doc = new Document();
|
||||
if (i == numDocs / 2) {
|
||||
doc.add(new BinaryPoint("f", uniquePointValue));
|
||||
} else {
|
||||
do {
|
||||
random().nextBytes(pointValue[0]);
|
||||
random().nextBytes(pointValue[1]);
|
||||
} while (Arrays.equals(pointValue[0], uniquePointValue[0]) || Arrays.equals(pointValue[1], uniquePointValue[1]));
|
||||
doc.add(new BinaryPoint("f", pointValue));
|
||||
}
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.forceMerge(1);
|
||||
final IndexReader r = DirectoryReader.open(w);
|
||||
w.close();
|
||||
final LeafReader lr = getOnlyLeafReader(r);
|
||||
PointValues points = lr.getPointValues("f");
|
||||
|
||||
// With >1 dims, the tree is balanced
|
||||
int actualMaxPointsInLeafNode = numDocs;
|
||||
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
|
||||
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
|
||||
}
|
||||
|
||||
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
|
||||
final int numLeaves = Integer.highestOneBit((numDocs - 1) / actualMaxPointsInLeafNode) << 1;
|
||||
assertEquals(numLeaves * actualMaxPointsInLeafNode,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_INSIDE_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
// Return 0 if no points match
|
||||
assertEquals(0,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
// If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
|
||||
assertEquals((actualMaxPointsInLeafNode + 1) / 2,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
for (int dim = 0; dim < 2; ++dim) {
|
||||
if (StringHelper.compare(3, uniquePointValue[0], 0, maxPackedValue, dim * 3) > 0 ||
|
||||
StringHelper.compare(3, uniquePointValue[0], 0, minPackedValue, dim * 3) < 0) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
}
|
||||
}
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1104,4 +1104,94 @@ public class TestBKD extends LuceneTestCase {
|
|||
in.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testEstimatePointCount() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
final int numValues = atLeast(10000); // make sure to have multiple leaves
|
||||
final int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
|
||||
final int numBytesPerDim = TestUtil.nextInt(random(), 1, 4);
|
||||
final byte[] pointValue = new byte[numBytesPerDim];
|
||||
final byte[] uniquePointValue = new byte[numBytesPerDim];
|
||||
random().nextBytes(uniquePointValue);
|
||||
|
||||
BKDWriter w = new BKDWriter(numValues, dir, "_temp", 1, numBytesPerDim, maxPointsInLeafNode,
|
||||
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, numValues, true);
|
||||
for (int i = 0; i < numValues; ++i) {
|
||||
if (i == numValues / 2) {
|
||||
w.add(uniquePointValue, i);
|
||||
} else {
|
||||
do {
|
||||
random().nextBytes(pointValue);
|
||||
} while (Arrays.equals(pointValue, uniquePointValue));
|
||||
w.add(pointValue, i);
|
||||
}
|
||||
}
|
||||
final long indexFP;
|
||||
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
|
||||
indexFP = w.finish(out);
|
||||
w.close();
|
||||
}
|
||||
|
||||
IndexInput pointsIn = dir.openInput("bkd", IOContext.DEFAULT);
|
||||
pointsIn.seek(indexFP);
|
||||
BKDReader points = new BKDReader(pointsIn);
|
||||
|
||||
int actualMaxPointsInLeafNode = numValues;
|
||||
while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
|
||||
actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
|
||||
}
|
||||
|
||||
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
|
||||
final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1;
|
||||
assertEquals(numLeaves * actualMaxPointsInLeafNode,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_INSIDE_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
// Return 0 if no points match
|
||||
assertEquals(0,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
// If only one point matches, then the point count is (actualMaxPointsInLeafNode + 1) / 2
|
||||
assertEquals((actualMaxPointsInLeafNode + 1) / 2,
|
||||
points.estimatePointCount(new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) throws IOException {}
|
||||
|
||||
@Override
|
||||
public void visit(int docID) throws IOException {}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
if (StringHelper.compare(3, uniquePointValue, 0, maxPackedValue, 0) > 0 ||
|
||||
StringHelper.compare(3, uniquePointValue, 0, minPackedValue, 0) < 0) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
}
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
}
|
||||
}));
|
||||
|
||||
pointsIn.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue