mirror of https://github.com/apache/lucene.git
LUCENE-10378 Implement Weight#count for PointRangeQuery (#658)
Implement Weight#count for PointRangeQuery to provide a faster way to calculate the number of docs matching the range when each doc has at most one point and the points are 1-dimensional.
This commit is contained in:
parent
6157854523
commit
dd25fabb03
|
@ -149,6 +149,10 @@ New Features
|
|||
|
||||
* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss,
|
||||
Alan Woodward)
|
||||
|
||||
* LUCENE-10378: Implement Weight#count for PointRangeQuery to provide a faster way to calculate
|
||||
the number of matching range docs when each doc has at most one point and the points are 1-dimensional.
|
||||
(Gautam Worah, Ignacio Vera, Adrien Grand)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
|
|
|
@ -82,7 +82,7 @@ public final class DocValuesFieldExistsQuery extends Query {
|
|||
final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
|
||||
if (fieldInfo == null || fieldInfo.getDocValuesType() == DocValuesType.NONE) {
|
||||
return 0; // the field doesn't index doc values
|
||||
} else if (!reader.hasDeletions()) {
|
||||
} else if (reader.hasDeletions() == false) {
|
||||
if (fieldInfo.getPointDimensionCount() > 0) {
|
||||
return reader.getPointValues(field).getDocCount();
|
||||
} else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Predicate;
|
||||
import org.apache.lucene.document.IntPoint;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -258,14 +260,10 @@ public abstract class PointRangeQuery extends Query {
|
|||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
|
||||
LeafReader reader = context.reader();
|
||||
|
||||
PointValues values = reader.getPointValues(field);
|
||||
private boolean checkValidPointValues(PointValues values) throws IOException {
|
||||
if (values == null) {
|
||||
// No docs in this segment/field indexed any points
|
||||
return null;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (values.getNumIndexDimensions() != numDims) {
|
||||
|
@ -286,6 +284,17 @@ public abstract class PointRangeQuery extends Query {
|
|||
+ " but this query has bytesPerDim="
|
||||
+ bytesPerDim);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
|
||||
LeafReader reader = context.reader();
|
||||
|
||||
PointValues values = reader.getPointValues(field);
|
||||
if (checkValidPointValues(values) == false) {
|
||||
return null;
|
||||
}
|
||||
|
||||
boolean allDocsMatch;
|
||||
if (values.getDocCount() == reader.maxDoc()) {
|
||||
|
@ -369,6 +378,106 @@ public abstract class PointRangeQuery extends Query {
|
|||
return scorerSupplier.get(Long.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int count(LeafReaderContext context) throws IOException {
|
||||
LeafReader reader = context.reader();
|
||||
|
||||
PointValues values = reader.getPointValues(field);
|
||||
if (checkValidPointValues(values) == false) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (reader.hasDeletions() == false
|
||||
&& numDims == 1
|
||||
&& values.getDocCount() == values.size()) {
|
||||
// if all documents have at-most one point
|
||||
return (int) pointCount(values.getPointTree(), this::relate, this::matches);
|
||||
}
|
||||
return super.count(context);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the number of points matching the provided range conditions. Using this method is
|
||||
* faster than calling {@link PointValues#intersect(IntersectVisitor)} to get the count of
|
||||
* intersecting points. This method does not enforce live documents, therefore it should only
|
||||
* be used when there are no deleted documents.
|
||||
*
|
||||
* @param pointTree start node of the count operation
|
||||
* @param nodeComparator comparator to be used for checking whether the internal node is
|
||||
* inside the range
|
||||
* @param leafComparator comparator to be used for checking whether the leaf node is inside
|
||||
* the range
|
||||
* @return count of points that match the range
|
||||
*/
|
||||
private long pointCount(
|
||||
PointValues.PointTree pointTree,
|
||||
BiFunction<byte[], byte[], Relation> nodeComparator,
|
||||
Predicate<byte[]> leafComparator)
|
||||
throws IOException {
|
||||
final int[] matchingNodeCount = {0};
|
||||
// create a custom IntersectVisitor that records the number of leafNodes that matched
|
||||
final IntersectVisitor visitor =
|
||||
new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID) {
|
||||
// this branch should be unreachable
|
||||
throw new UnsupportedOperationException(
|
||||
"This IntersectVisitor does not perform any actions on a "
|
||||
+ "docID="
|
||||
+ docID
|
||||
+ " node being visited");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) {
|
||||
if (leafComparator.test(packedValue)) {
|
||||
matchingNodeCount[0]++;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return nodeComparator.apply(minPackedValue, maxPackedValue);
|
||||
}
|
||||
};
|
||||
pointCount(visitor, pointTree, matchingNodeCount);
|
||||
return matchingNodeCount[0];
|
||||
}
|
||||
|
||||
private void pointCount(
|
||||
IntersectVisitor visitor, PointValues.PointTree pointTree, int[] matchingNodeCount)
|
||||
throws IOException {
|
||||
Relation r = visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
|
||||
switch (r) {
|
||||
case CELL_OUTSIDE_QUERY:
|
||||
// This cell is fully outside the query shape: return 0 as the count of its nodes
|
||||
return;
|
||||
case CELL_INSIDE_QUERY:
|
||||
// This cell is fully inside the query shape: return the size of the entire node as the
|
||||
// count
|
||||
matchingNodeCount[0] += pointTree.size();
|
||||
return;
|
||||
case CELL_CROSSES_QUERY:
|
||||
/*
|
||||
The cell crosses the shape boundary, or the cell fully contains the query, so we fall
|
||||
through and do full counting.
|
||||
*/
|
||||
if (pointTree.moveToChild()) {
|
||||
do {
|
||||
pointCount(visitor, pointTree, matchingNodeCount);
|
||||
} while (pointTree.moveToSibling());
|
||||
pointTree.moveToParent();
|
||||
} else {
|
||||
// we have reached a leaf node here.
|
||||
pointTree.visitDocValues(visitor);
|
||||
// leaf node count is saved in the matchingNodeCount array by the visitor
|
||||
}
|
||||
return;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unreachable code");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCacheable(LeafReaderContext ctx) {
|
||||
return true;
|
||||
|
|
|
@ -2194,6 +2194,60 @@ public class TestPointQueries extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
public void testPointRangeWeightCount() throws IOException {
|
||||
// the optimization for Weight#count kicks in only when the number of dimensions is 1
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
int numPoints = random().nextInt(1, 10);
|
||||
int[] points = new int[numPoints];
|
||||
|
||||
int numQueries = random().nextInt(1, 10);
|
||||
int[] lowerBound = new int[numQueries];
|
||||
int[] upperBound = new int[numQueries];
|
||||
int[] expectedCount = new int[numQueries];
|
||||
|
||||
for (int i = 0; i < numQueries; i++) {
|
||||
// generate random queries
|
||||
lowerBound[i] = random().nextInt(1, 10);
|
||||
// allow malformed ranges where upperBound could be less than lowerBound
|
||||
upperBound[i] = random().nextInt(1, 10);
|
||||
}
|
||||
|
||||
for (int i = 0; i < numPoints; i++) {
|
||||
// generate random 1D points
|
||||
points[i] = random().nextInt(1, 10);
|
||||
if (random().nextBoolean()) {
|
||||
// the doc may have at-most 1 point
|
||||
Document doc = new Document();
|
||||
doc.add(new IntPoint("point", points[i]));
|
||||
w.addDocument(doc);
|
||||
for (int j = 0; j < numQueries; j++) {
|
||||
// calculate the number of points that lie within the query range
|
||||
if (lowerBound[j] <= points[i] && points[i] <= upperBound[j]) {
|
||||
expectedCount[j]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
w.commit();
|
||||
w.forceMerge(1);
|
||||
|
||||
IndexReader reader = w.getReader();
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
if (searcher.leafContexts.isEmpty() == false) { // we need at least 1 leaf in the segment
|
||||
for (int i = 0; i < numQueries; i++) {
|
||||
Query query = IntPoint.newRangeQuery("point", lowerBound[i], upperBound[i]);
|
||||
Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
|
||||
assertEquals(expectedCount[i], weight.count(searcher.leafContexts.get(0)));
|
||||
}
|
||||
}
|
||||
|
||||
reader.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testPointRangeEquals() {
|
||||
Query q1, q2;
|
||||
|
||||
|
|
Loading…
Reference in New Issue