LUCENE-10378 Implement Weight#count for PointRangeQuery (#658)

Implement Weight#count for PointRangeQuery to provide a faster way to calculate
the number of matching range docs when each doc has at most one point and the
points are 1-dimensional.
Gautam Worah 2022-02-15 22:23:49 -08:00 committed by GitHub
parent 6157854523
commit dd25fabb03
4 changed files with 174 additions and 7 deletions
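
For context, here is a minimal sketch (not part of the patch) of how the optimization surfaces to callers. The class name, the "price" field, and the sample values are illustrative only; the sketch assumes IndexSearcher#count consults Weight#count per segment, so a one-dimensional IntPoint range query over an index with no deletions and at most one point per document can be counted from the point tree without visiting each matching document.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class PointRangeCountExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      for (int value : new int[] {3, 7, 42}) {
        Document doc = new Document();
        // exactly one 1-dimensional point per document: the case this shortcut targets
        doc.add(new IntPoint("price", value));
        writer.addDocument(doc);
      }
      writer.commit();
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = IntPoint.newRangeQuery("price", 1, 10);
        // with no deletions, one dimension, and docCount == size, the per-leaf count
        // can come from the point tree instead of iterating over matching docs
        System.out.println(searcher.count(query)); // 3 and 7 fall in [1, 10] -> prints 2
      }
    }
  }
}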


@@ -149,6 +149,10 @@ New Features
* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss,
Alan Woodward)
* LUCENE-10378: Implement Weight#count for PointRangeQuery to provide a faster way to calculate
the number of matching range docs when each doc has at most one point and the points are 1-dimensional.
(Gautam Worah, Ignacio Vera, Adrien Grand)
Improvements
---------------------


@@ -82,7 +82,7 @@ public final class DocValuesFieldExistsQuery extends Query {
final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo == null || fieldInfo.getDocValuesType() == DocValuesType.NONE) {
return 0; // the field doesn't index doc values
} else if (!reader.hasDeletions()) {
} else if (reader.hasDeletions() == false) {
if (fieldInfo.getPointDimensionCount() > 0) {
return reader.getPointValues(field).getDocCount();
} else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {


@@ -19,6 +19,8 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.BiFunction;
import java.util.function.Predicate;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -258,14 +260,10 @@ public abstract class PointRangeQuery extends Query {
};
}
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field);
private boolean checkValidPointValues(PointValues values) throws IOException {
if (values == null) {
// No docs in this segment/field indexed any points
return null;
return false;
}
if (values.getNumIndexDimensions() != numDims) {
@@ -286,6 +284,17 @@ public abstract class PointRangeQuery extends Query {
+ " but this query has bytesPerDim="
+ bytesPerDim);
}
return true;
}
@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field);
if (checkValidPointValues(values) == false) {
return null;
}
boolean allDocsMatch;
if (values.getDocCount() == reader.maxDoc()) {
@@ -369,6 +378,106 @@ public abstract class PointRangeQuery extends Query {
return scorerSupplier.get(Long.MAX_VALUE);
}
@Override
public int count(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field);
if (checkValidPointValues(values) == false) {
return 0;
}
if (reader.hasDeletions() == false
&& numDims == 1
&& values.getDocCount() == values.size()) {
// if all documents have at most one point
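// (getDocCount() == size() means every doc with points has exactly one point,
// so counting the matching points also counts the matching docs)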
return (int) pointCount(values.getPointTree(), this::relate, this::matches);
}
return super.count(context);
}
/**
* Finds the number of points matching the provided range conditions. Using this method is
* faster than calling {@link PointValues#intersect(IntersectVisitor)} to get the count of
intersecting points. This method does not enforce live documents; therefore, it should only
* be used when there are no deleted documents.
*
* @param pointTree start node of the count operation
* @param nodeComparator comparator to be used for checking whether the internal node is
* inside the range
* @param leafComparator comparator to be used for checking whether the leaf node is inside
* the range
* @return count of points that match the range
*/
private long pointCount(
PointValues.PointTree pointTree,
BiFunction<byte[], byte[], Relation> nodeComparator,
Predicate<byte[]> leafComparator)
throws IOException {
final int[] matchingNodeCount = {0};
// create a custom IntersectVisitor that records the number of leafNodes that matched
final IntersectVisitor visitor =
new IntersectVisitor() {
@Override
public void visit(int docID) {
// this branch should be unreachable
throw new UnsupportedOperationException(
"This IntersectVisitor does not perform any actions on a "
+ "docID="
+ docID
+ " node being visited");
}
@Override
public void visit(int docID, byte[] packedValue) {
if (leafComparator.test(packedValue)) {
matchingNodeCount[0]++;
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return nodeComparator.apply(minPackedValue, maxPackedValue);
}
};
pointCount(visitor, pointTree, matchingNodeCount);
return matchingNodeCount[0];
}
private void pointCount(
IntersectVisitor visitor, PointValues.PointTree pointTree, int[] matchingNodeCount)
throws IOException {
Relation r = visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
switch (r) {
case CELL_OUTSIDE_QUERY:
// This cell is fully outside the query shape: it contributes nothing to the count
return;
case CELL_INSIDE_QUERY:
// This cell is fully inside the query shape: add the size of the entire subtree to the
// count
matchingNodeCount[0] += pointTree.size();
return;
case CELL_CROSSES_QUERY:
/*
The cell crosses the shape boundary, or the cell fully contains the query, so we fall
through and do full counting.
*/
if (pointTree.moveToChild()) {
do {
pointCount(visitor, pointTree, matchingNodeCount);
} while (pointTree.moveToSibling());
pointTree.moveToParent();
} else {
// we have reached a leaf node here.
pointTree.visitDocValues(visitor);
// leaf node count is saved in the matchingNodeCount array by the visitor
}
return;
default:
throw new IllegalArgumentException("Unreachable code");
}
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;


@@ -2194,6 +2194,60 @@ public class TestPointQueries extends LuceneTestCase {
dir.close();
}
public void testPointRangeWeightCount() throws IOException {
// the optimization for Weight#count kicks in only when the number of dimensions is 1
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int numPoints = random().nextInt(1, 10);
int[] points = new int[numPoints];
int numQueries = random().nextInt(1, 10);
int[] lowerBound = new int[numQueries];
int[] upperBound = new int[numQueries];
int[] expectedCount = new int[numQueries];
for (int i = 0; i < numQueries; i++) {
// generate random queries
lowerBound[i] = random().nextInt(1, 10);
// allow malformed ranges where upperBound could be less than lowerBound
upperBound[i] = random().nextInt(1, 10);
}
for (int i = 0; i < numPoints; i++) {
// generate random 1D points
points[i] = random().nextInt(1, 10);
if (random().nextBoolean()) {
// the doc may have at most 1 point
Document doc = new Document();
doc.add(new IntPoint("point", points[i]));
w.addDocument(doc);
for (int j = 0; j < numQueries; j++) {
// calculate the number of points that lie within the query range
if (lowerBound[j] <= points[i] && points[i] <= upperBound[j]) {
expectedCount[j]++;
}
}
}
}
w.commit();
w.forceMerge(1);
IndexReader reader = w.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
if (searcher.leafContexts.isEmpty() == false) { // we need at least 1 leaf in the segment
for (int i = 0; i < numQueries; i++) {
Query query = IntPoint.newRangeQuery("point", lowerBound[i], upperBound[i]);
Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1);
assertEquals(expectedCount[i], weight.count(searcher.leafContexts.get(0)));
}
}
reader.close();
w.close();
dir.close();
}
public void testPointRangeEquals() {
Query q1, q2;