From 680f21dca5fe9d1bee85f2a0ea688119fda49e80 Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Thu, 1 Sep 2022 14:04:43 -0700
Subject: [PATCH] LUCENE-10207: TermInSetQuery now provides a ScorerSupplier
 with cost estimation for use in IndexOrDocValuesQuery (#1058)

---
 lucene/CHANGES.txt                            |  3 +
 .../apache/lucene/search/TermInSetQuery.java  | 68 +++++++++++++++++--
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 02956ce0b37..bb3c16dd612 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -101,6 +101,9 @@ Improvements
 ---------------------
 * LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
 
+* LUCENE-10207: TermInSetQuery can now provide a ScorerSupplier with cost estimation, making it
+  usable in IndexOrDocValuesQuery. (Greg Miller)
+
 * GITHUB#11715: Add Integer awareness to RamUsageEstimator.sizeOf (Mike Drob)
 
 Optimizations
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 6fd2e5f4d12..cd44ab5e1b2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -354,15 +354,69 @@ public class TermInSetQuery extends Query implements Accountable {
       }
 
       @Override
-      public Scorer scorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
+      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+        Terms indexTerms = context.reader().terms(field);
+        if (indexTerms == null) {
           return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.scorer(context);
-        } else {
-          return scorer(weightOrBitSet.set);
         }
+
+        // Cost estimation reasoning is:
+        //  1. Assume every query term matches at least one document (queryTermsCount).
+        //  2. Determine the total number of docs beyond the first one for each term.
+        //     That count provides a ceiling on the number of extra docs that could match beyond
+        //     that first one. (We omit the first since it's already been counted in #1).
+        // This approach still provides correct worst-case cost in general, but provides tighter
+        // estimates for primary-key-like fields. See: LUCENE-10207
+
+        // TODO: This cost estimation may grossly overestimate since we have no index statistics
+        // for the specific query terms. While it's nice to avoid the cost of intersecting the
+        // query terms with the index, it could be beneficial to do that work and get better
+        // cost estimates.
+        final long cost;
+        final long queryTermsCount = termData.size();
+        long potentialExtraCost = indexTerms.getSumDocFreq();
+        final long indexedTermCount = indexTerms.size();
+        if (indexedTermCount != -1) {
+          potentialExtraCost -= indexedTermCount;
+        }
+        cost = queryTermsCount + potentialExtraCost;
+
+        final Weight weight = this;
+        return new ScorerSupplier() {
+          @Override
+          public Scorer get(long leadCost) throws IOException {
+            // ScorerSupplier#get must never return null, so a null rewrite result is mapped to
+            // the empty scorer below instead of being returned directly.
+            final WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
+            final Scorer scorer;
+            if (weightOrDocIdSet == null) {
+              scorer = null;
+            } else if (weightOrDocIdSet.weight != null) {
+              scorer = weightOrDocIdSet.weight.scorer(context);
+            } else {
+              scorer = scorer(weightOrDocIdSet.set);
+            }
+
+            return Objects.requireNonNullElseGet(
+                scorer,
+                () ->
+                    new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
+          }
+
+          @Override
+          public long cost() {
+            return cost;
+          }
+        };
+      }
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        final ScorerSupplier supplier = scorerSupplier(context);
+        if (supplier == null) {
+          return null;
+        }
+        return supplier.get(Long.MAX_VALUE);
       }
 
       @Override