LUCENE-10378 Implement Weight#count for PointRangeQuery (#658)

Implement Weight#count for PointRangeQuery to provide a faster way to calculate the number of matching range docs when each doc has at most one point and the points are 1-dimensional.
2022-02-15 22:23:49 -08:00 · 2022-02-15 22:23:49 -08:00 · dd25fabb03
parent 6157854523
commit dd25fabb03
4 changed files with 174 additions and 7 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -150,6 +150,10 @@ New Features
 * LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, 
  Alan Woodward)
 * LUCENE-10378: Implement Weight#count for PointRangeQuery to provide a faster way to calculate
  the number of matching range docs when each doc has at most one point and the points are 1-dimensional.
  (Gautam Worah, Ignacio Vera, Adrien Grand)
 Improvements
 ---------------------
--- a/lucene/core/src/java/org/apache/lucene/search/DocValuesFieldExistsQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesFieldExistsQuery.java
@ -82,7 +82,7 @@ public final class DocValuesFieldExistsQuery extends Query {
        final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
        if (fieldInfo == null || fieldInfo.getDocValuesType() == DocValuesType.NONE) {
          return 0; // the field doesn't index doc values
-        } else if (!reader.hasDeletions()) {
+        } else if (reader.hasDeletions() == false) {
          if (fieldInfo.getPointDimensionCount() > 0) {
            return reader.getPointValues(field).getDocCount();
          } else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {
--- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
@ -19,6 +19,8 @@ package org.apache.lucene.search;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Objects;
 import java.util.function.BiFunction;
 import java.util.function.Predicate;
 import org.apache.lucene.document.IntPoint;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
@ -258,14 +260,10 @@ public abstract class PointRangeQuery extends Query {
        };
      }
-      @Override
+      private boolean checkValidPointValues(PointValues values) throws IOException {
      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        PointValues values = reader.getPointValues(field);
        if (values == null) {
          // No docs in this segment/field indexed any points
-          return null;
+          return false;
        }
        if (values.getNumIndexDimensions() != numDims) {
@ -286,6 +284,17 @@ public abstract class PointRangeQuery extends Query {
                  + " but this query has bytesPerDim="
                  + bytesPerDim);
        }
        return true;
      }
      @Override
      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        PointValues values = reader.getPointValues(field);
        if (checkValidPointValues(values) == false) {
          return null;
        }
        boolean allDocsMatch;
        if (values.getDocCount() == reader.maxDoc()) {
@ -369,6 +378,106 @@ public abstract class PointRangeQuery extends Query {
        return scorerSupplier.get(Long.MAX_VALUE);
      }
      /**
       * Returns the number of documents in this leaf that match the range. When the leaf has no
       * deletions, the query is 1-dimensional and the field holds exactly as many points as it has
       * documents, the count is taken from the point tree; otherwise the decision is delegated to
       * the superclass.
       */
      @Override
      public int count(LeafReaderContext context) throws IOException {
        final LeafReader leafReader = context.reader();
        final PointValues pointValues = leafReader.getPointValues(field);
        if (checkValidPointValues(pointValues) == false) {
          return 0;
        }
        // Counting points equals counting docs only if nothing is deleted, there is a single
        // dimension, and every document carries at most one point.
        final boolean pointsMapOneToOneToDocs =
            leafReader.hasDeletions() == false
                && numDims == 1
                && pointValues.getDocCount() == pointValues.size();
        if (pointsMapOneToOneToDocs == false) {
          return super.count(context);
        }
        final long matchingPoints =
            pointCount(pointValues.getPointTree(), this::relate, this::matches);
        return (int) matchingPoints;
      }
      /**
       * Finds the number of points matching the provided range conditions. Using this method is
       * faster than calling {@link PointValues#intersect(IntersectVisitor)} to get the count of
       * intersecting points. This method does not enforce live documents, therefore it should only
       * be used when there are no deleted documents.
       *
       * @param pointTree start node of the count operation
       * @param nodeComparator relates a cell, given as its min and max packed values, to the range
       * @param leafComparator tests whether a single packed value lies inside the range
       * @return count of points that match the range
       * @throws IOException propagated from the traversal of the point tree
       */
      private long pointCount(
          PointValues.PointTree pointTree,
          BiFunction<byte[], byte[], Relation> nodeComparator,
          Predicate<byte[]> leafComparator)
          throws IOException {
        // The total is returned as a long and whole-cell sizes are added to it, so the accumulator
        // is a long as well; an int accumulator would be narrowed silently by the "+=" below.
        // A one-element array is used so the anonymous visitor can update it.
        final long[] matchingNodeCount = {0};
        // create a custom IntersectVisitor that records the number of points that matched
        final IntersectVisitor visitor =
            new IntersectVisitor() {
              @Override
              public void visit(int docID) {
                // this branch should be unreachable: cells fully inside the range are counted via
                // their size and are never handed to the visitor
                throw new UnsupportedOperationException(
                    "This IntersectVisitor does not perform any actions on a "
                        + "docID="
                        + docID
                        + " node being visited");
              }
              @Override
              public void visit(int docID, byte[] packedValue) {
                if (leafComparator.test(packedValue)) {
                  matchingNodeCount[0]++;
                }
              }
              @Override
              public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                return nodeComparator.apply(minPackedValue, maxPackedValue);
              }
            };
        pointCount(visitor, pointTree, matchingNodeCount);
        return matchingNodeCount[0];
      }
      /**
       * Recursive step of the count: relates the cell under the tree cursor to the query and adds
       * the number of matching points below it to {@code matchingNodeCount[0]}. After descending
       * into children the cursor is moved back to the parent.
       *
       * @param visitor supplies the cell relation and counts matching values in leaf cells
       * @param pointTree cursor positioned on the cell to examine
       * @param matchingNodeCount single-element accumulator for the running total
       * @throws IOException propagated from the point tree operations
       */
      private void pointCount(
          IntersectVisitor visitor, PointValues.PointTree pointTree, long[] matchingNodeCount)
          throws IOException {
        Relation r = visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
        switch (r) {
          case CELL_OUTSIDE_QUERY:
            // This cell is fully outside the query shape: it contributes nothing to the count
            return;
          case CELL_INSIDE_QUERY:
            // This cell is fully inside the query shape: every point under it matches, so add the
            // size of the entire node without visiting individual values
            matchingNodeCount[0] += pointTree.size();
            return;
          case CELL_CROSSES_QUERY:
            /*
            The cell crosses the shape boundary, or the cell fully contains the query, so the
            cell has to be examined in more detail.
            */
            if (pointTree.moveToChild()) {
              do {
                pointCount(visitor, pointTree, matchingNodeCount);
              } while (pointTree.moveToSibling());
              pointTree.moveToParent();
            } else {
              // we have reached a leaf node here.
              pointTree.visitDocValues(visitor);
              // leaf node count is saved in the matchingNodeCount array by the visitor
            }
            return;
          default:
            throw new IllegalArgumentException("Unreachable code");
        }
      }
      @Override
      public boolean isCacheable(LeafReaderContext ctx) {
        return true;
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@ -2194,6 +2194,60 @@ public class TestPointQueries extends LuceneTestCase {
    dir.close();
  }
  /**
   * Indexes a random handful of single-valued 1D int points and checks that {@code Weight#count}
   * on the first leaf agrees with a brute-force count for several random ranges.
   */
  public void testPointRangeWeightCount() throws IOException {
    // Weight#count only takes its fast path for 1-dimensional points
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    final int pointTotal = random().nextInt(1, 10);
    final int[] values = new int[pointTotal];
    final int queryTotal = random().nextInt(1, 10);
    final int[] lows = new int[queryTotal];
    final int[] highs = new int[queryTotal];
    final int[] expected = new int[queryTotal];
    for (int q = 0; q < queryTotal; q++) {
      // random ranges; the upper bound is drawn independently, so it may be below the lower bound
      lows[q] = random().nextInt(1, 10);
      highs[q] = random().nextInt(1, 10);
    }
    for (int p = 0; p < pointTotal; p++) {
      // a value is drawn for every slot, but only a random subset is actually indexed
      values[p] = random().nextInt(1, 10);
      if (random().nextBoolean() == false) {
        continue;
      }
      final int value = values[p];
      // each indexed document carries exactly one point
      Document doc = new Document();
      doc.add(new IntPoint("point", value));
      writer.addDocument(doc);
      for (int q = 0; q < queryTotal; q++) {
        // brute-force tally of the indexed points inside each query range
        final boolean inRange = lows[q] <= value && value <= highs[q];
        if (inRange) {
          expected[q]++;
        }
      }
    }
    writer.commit();
    writer.forceMerge(1);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = new IndexSearcher(reader);
    // nothing to verify when no document was indexed and the reader has no leaves
    final boolean hasLeaf = searcher.leafContexts.isEmpty() == false;
    for (int q = 0; hasLeaf && q < queryTotal; q++) {
      Query rangeQuery = IntPoint.newRangeQuery("point", lows[q], highs[q]);
      Weight weight = searcher.createWeight(rangeQuery, ScoreMode.COMPLETE_NO_SCORES, 1);
      assertEquals(expected[q], weight.count(searcher.leafContexts.get(0)));
    }
    reader.close();
    writer.close();
    directory.close();
  }
  public void testPointRangeEquals() {
    Query q1, q2;