LUCENE-10378 Implement Weight#count for PointRangeQuery (#658)

Implement Weight#count for PointRangeQuery to provide a faster way to calculate
the number of matching range docs when each doc has at most one point and the
points are 1-dimensional.
Gautam Worah 2022-02-15 22:23:49 -08:00 committed by GitHub
parent 6157854523
commit dd25fabb03
4 changed files with 174 additions and 7 deletions
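
For context, here is a minimal sketch (not part of the patch) of how the optimization surfaces to callers. The class name, the "price" field, and the sample values are illustrative only; the sketch assumes IndexSearcher#count consults Weight#count per segment, so a one-dimensional IntPoint range query over an index with no deletions and at most one point per document can be counted from the point tree without visiting each matching document.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class PointRangeCountExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      for (int value : new int[] {3, 7, 42}) {
        Document doc = new Document();
        // exactly one 1-dimensional point per document: the case this shortcut targets
        doc.add(new IntPoint("price", value));
        writer.addDocument(doc);
      }
      writer.commit();
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = IntPoint.newRangeQuery("price", 1, 10);
        // with no deletions, one dimension, and docCount == size, the per-leaf count
        // can come from the point tree instead of iterating over matching docs
        System.out.println(searcher.count(query)); // 3 and 7 fall in [1, 10] -> prints 2
      }
    }
  }
}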


@@ -149,6 +149,10 @@ New Features
* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss,
Alan Woodward)
* LUCENE-10378: Implement Weight#count for PointRangeQuery to provide a faster way to calculate
the number of matching range docs when each doc has at most one point and the points are 1-dimensional.
(Gautam Worah, Ignacio Vera, Adrien Grand)
Improvements
---------------------


@@ -82,7 +82,7 @@ public final class DocValuesFieldExistsQuery extends Query {
final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo == null || fieldInfo.getDocValuesType() == DocValuesType.NONE) {
return 0; // the field doesn't index doc values
} else if (!reader.hasDeletions()) {
} else if (reader.hasDeletions() == false) {
if (fieldInfo.getPointDimensionCount() > 0) {
return reader.getPointValues(field).getDocCount();
} else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {


@@ -19,6 +19,8 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.BiFunction;
import java.util.function.Predicate;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -258,14 +260,10 @@ public abstract class PointRangeQuery extends Query {
};
}
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field);
private boolean checkValidPointValues(PointValues values) throws IOException {
if (values == null) {
// No docs in this segment/field indexed any points
return null;
return false;
}
if (values.getNumIndexDimensions() != numDims) {
@@ -286,6 +284,17 @@ public abstract class PointRangeQuery extends Query {
+ " but this query has bytesPerDim="
+ bytesPerDim);
}
return true;
}
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field);
if (checkValidPointValues(values) == false) {
return null;
}
boolean allDocsMatch;
if (values.getDocCount() == reader.maxDoc()) {
@@ -369,6 +378,106 @@ public abstract class PointRangeQuery extends Query {
return scorerSupplier.get(Long.MAX_VALUE);
}
@Override
public int count(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field);
if (checkValidPointValues(values) == false) {
return 0;
}
if (reader.hasDeletions() == false
&& numDims == 1
&& values.getDocCount() == values.size()) {
// if all documents have at most one point
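// (getDocCount() == size() means every doc with points has exactly one point,
// so counting the matching points also counts the matching docs)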
return (int) pointCount(values.getPointTree(), this::relate, this::matches);
}
return super.count(context);
}
/**
* Finds the number of points matching the provided range conditions. Using this method is
* faster than calling {@link PointValues#intersect(IntersectVisitor)} to get the count of
intersecting points. This method does not enforce live documents; therefore, it should only
* be used when there are no deleted documents.
*
* @param pointTree start node of the count operation
* @param nodeComparator comparator to be used for checking whether the internal node is
* inside the range
* @param leafComparator comparator to be used for checking whether the leaf node is inside
* the range
* @return count of points that match the range
*/
private long pointCount(
PointValues.PointTree pointTree,
BiFunction<byte[], byte[], Relation> nodeComparator,
Predicate<byte[]> leafComparator)
throws IOException {
final int[] matchingNodeCount = {0};
// create a custom IntersectVisitor that records the number of leafNodes that matched
final IntersectVisitor visitor =
new IntersectVisitor() {
@Override
public void visit(int docID) {
// this branch should be unreachable
throw new UnsupportedOperationException(
"This IntersectVisitor does not perform any actions on a "
+ "docID="
+ docID
+ " node being visited");
}
@Override
public void visit(int docID, byte[] packedValue) {
if (leafComparator.test(packedValue)) {
matchingNodeCount[0]++;
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return nodeComparator.apply(minPackedValue, maxPackedValue);
}
};
pointCount(visitor, pointTree, matchingNodeCount);
return matchingNodeCount[0];
}
private void pointCount(
IntersectVisitor visitor, PointValues.PointTree pointTree, int[] matchingNodeCount)
throws IOException {
Relation r = visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
switch (r) {
case CELL_OUTSIDE_QUERY:
// This cell is fully outside the query shape: it contributes nothing to the count
return;
case CELL_INSIDE_QUERY:
// This cell is fully inside the query shape: add the size of the entire subtree to the
// count
matchingNodeCount[0] += pointTree.size();
return;
case CELL_CROSSES_QUERY:
/*
The cell crosses the shape boundary, or the cell fully contains the query, so we fall
through and do full counting.
*/
if (pointTree.moveToChild()) {
do {
pointCount(visitor, pointTree, matchingNodeCount);
} while (pointTree.moveToSibling());
pointTree.moveToParent();
} else {
// we have reached a leaf node here.
pointTree.visitDocValues(visitor);
// leaf node count is saved in the matchingNodeCount array by the visitor
}
return;
default:
throw new IllegalArgumentException("Unreachable code");
}
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;


@@ -2194,6 +2194,60 @@ public class TestPointQueries extends LuceneTestCase {
dir.close();
}
public void testPointRangeWeightCount() throws IOException {
// the optimization for Weight#count kicks in only when the number of dimensions is 1
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int numPoints = random().nextInt(1, 10);
int[] points = new int[numPoints];
int numQueries = random().nextInt(1, 10);
int[] lowerBound = new int[numQueries];
int[] upperBound = new int[numQueries];
int[] expectedCount = new int[numQueries];
for (int i = 0; i < numQueries; i++) {
// generate random queries
lowerBound[i] = random().nextInt(1, 10);
// allow malformed ranges where upperBound could be less than lowerBound
upperBound[i] = random().nextInt(1, 10);
}
for (int i = 0; i < numPoints; i++) {
// generate random 1D points
points[i] = random().nextInt(1, 10);
if (random().nextBoolean()) {
// the doc may have at most 1 point
Document doc = new Document();
doc.add(new IntPoint("point", points[i]));
w.addDocument(doc);
for (int j = 0; j < numQueries; j++) {
// calculate the number of points that lie within the query range
if (lowerBound[j] <= points[i] && points[i] <= upperBound[j]) {
expectedCount[j]++;
}
}
}
}
w.commit();
w.forceMerge(1);
IndexReader reader = w.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
if (searcher.leafContexts.isEmpty() == false) { // we need at least 1 leaf in the segment
for (int i = 0; i < numQueries; i++) {
Query query = IntPoint.newRangeQuery("point", lowerBound[i], upperBound[i]);
Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
assertEquals(expectedCount[i], weight.count(searcher.leafContexts.get(0)));
}
}
reader.close();
w.close();
dir.close();
}
public void testPointRangeEquals() {
Query q1, q2;