LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)

This commit is contained in:
Greg Miller 2022-09-01 14:04:43 -07:00 committed by GitHub
parent 0462a0ad73
commit 680f21dca5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 64 additions and 7 deletions

View File

@ -101,6 +101,9 @@ Improvements
---------------------
* LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
* LUCENE-10207: TermInSetQuery can now provide a ScoreSupplier with cost estimation, making it
usable in IndexOrDocValuesQuery. (Greg Miller)
* GITHUB#11715: Add Integer awareness to RamUsageEstimator.sizeOf (Mike Drob)
Optimizations

View File

@ -354,15 +354,69 @@ public class TermInSetQuery extends Query implements Accountable {
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
if (weightOrBitSet == null) {
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
Terms indexTerms = context.reader().terms(field);
if (indexTerms == null) {
return null;
} else if (weightOrBitSet.weight != null) {
return weightOrBitSet.weight.scorer(context);
} else {
return scorer(weightOrBitSet.set);
}
// Cost estimation reasoning is:
// 1. Assume every query term matches at least one document (queryTermsCount).
// 2. Determine the total number of docs beyond the first one for each term.
// That count provides a ceiling on the number of extra docs that could match beyond
// that first one. (We omit the first since it's already been counted in #1).
// This approach still provides correct worst-case cost in general, but provides tighter
// estimates for primary-key-like fields. See: LUCENE-10207
// TODO: This cost estimation may grossly overestimate since we have no index statistics
// for the specific query terms. While it's nice to avoid the cost of intersecting the
// query terms with the index, it could be beneficial to do that work and get better
// cost estimates.
final long cost;
final long queryTermsCount = termData.size();
long potentialExtraCost = indexTerms.getSumDocFreq();
final long indexedTermCount = indexTerms.size();
if (indexedTermCount != -1) {
potentialExtraCost -= indexedTermCount;
}
cost = queryTermsCount + potentialExtraCost;
final Weight weight = this;
return new ScorerSupplier() {
@Override
public Scorer get(long leadCost) throws IOException {
WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
if (weightOrDocIdSet == null) {
return null;
}
final Scorer scorer;
if (weightOrDocIdSet.weight != null) {
scorer = weightOrDocIdSet.weight.scorer(context);
} else {
scorer = scorer(weightOrDocIdSet.set);
}
return Objects.requireNonNullElseGet(
scorer,
() ->
new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
}
@Override
public long cost() {
return cost;
}
};
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
  // Convenience path: build the supplier and immediately materialize the scorer,
  // using Long.MAX_VALUE as the lead cost since no cheaper clause is driving iteration.
  final ScorerSupplier scorerSupplier = scorerSupplier(context);
  return scorerSupplier == null ? null : scorerSupplier.get(Long.MAX_VALUE);
}
@Override