From 680f21dca5fe9d1bee85f2a0ea688119fda49e80 Mon Sep 17 00:00:00 2001
From: Greg Miller
Date: Thu, 1 Sep 2022 14:04:43 -0700
Subject: [PATCH] LUCENE-10207: TermInSetQuery now provides a ScorerSupplier with
 cost estimation for use in IndexOrDocValuesQuery (#1058)

---
 lucene/CHANGES.txt                            |  3 +
 .../apache/lucene/search/TermInSetQuery.java  | 68 +++++++++++++++++--
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 02956ce0b37..bb3c16dd612 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -101,6 +101,9 @@ Improvements
 ---------------------
 * LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
 
+* LUCENE-10207: TermInSetQuery can now provide a ScorerSupplier with cost estimation, making it
+  usable in IndexOrDocValuesQuery. (Greg Miller)
+
 * GITHUB#11715: Add Integer awareness to RamUsageEstimator.sizeOf (Mike Drob)
 
 Optimizations
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 6fd2e5f4d12..cd44ab5e1b2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -354,15 +354,69 @@ public class TermInSetQuery extends Query implements Accountable {
       }
 
       @Override
-      public Scorer scorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
+      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+        Terms indexTerms = context.reader().terms(field);
+        if (indexTerms == null) {
           return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.scorer(context);
-        } else {
-          return scorer(weightOrBitSet.set);
         }
+
+        // Cost estimation reasoning is:
+        // 1. Assume every query term matches at least one document (queryTermsCount).
+        // 2. Determine the total number of docs beyond the first one for each term.
+        //    That count provides a ceiling on the number of extra docs that could match beyond
+        //    that first one. (We omit the first since it's already been counted in #1).
+        // This approach still provides correct worst-case cost in general, but provides tighter
+        // estimates for primary-key-like fields. See: LUCENE-10207
+
+        // TODO: This cost estimation may grossly overestimate since we have no index statistics
+        // for the specific query terms. While it's nice to avoid the cost of intersecting the
+        // query terms with the index, it could be beneficial to do that work and get better
+        // cost estimates.
+        final long cost;
+        final long queryTermsCount = termData.size();
+        long potentialExtraCost = indexTerms.getSumDocFreq();
+        final long indexedTermCount = indexTerms.size();
+        if (indexedTermCount != -1) {
+          potentialExtraCost -= indexedTermCount;
+        }
+        cost = queryTermsCount + potentialExtraCost;
+
+        final Weight weight = this;
+        return new ScorerSupplier() {
+          @Override
+          public Scorer get(long leadCost) throws IOException {
+            WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
+
+            // get() must not return null; a null rewrite falls through to the empty scorer.
+            final Scorer scorer;
+            if (weightOrDocIdSet == null) {
+              scorer = null;
+            } else if (weightOrDocIdSet.weight != null) {
+              scorer = weightOrDocIdSet.weight.scorer(context);
+            } else {
+              scorer = scorer(weightOrDocIdSet.set);
+            }
+
+            return Objects.requireNonNullElseGet(
+                scorer,
+                () ->
+                    new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
+          }
+
+          @Override
+          public long cost() {
+            return cost;
+          }
+        };
+      }
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        final ScorerSupplier supplier = scorerSupplier(context);
+        if (supplier == null) {
+          return null;
+        }
+        return supplier.get(Long.MAX_VALUE);
       }
 
       @Override