LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)

This commit is contained in:
Greg Miller 2022-09-01 14:04:43 -07:00 committed by GitHub
parent 0462a0ad73
commit 680f21dca5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 64 additions and 7 deletions

View File

@ -101,6 +101,9 @@ Improvements
---------------------
* LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
* LUCENE-10207: TermInSetQuery can now provide a ScoreSupplier with cost estimation, making it
usable in IndexOrDocValuesQuery. (Greg Miller)
* GITHUB#11715: Add Integer awareness to RamUsageEstimator.sizeOf (Mike Drob)
Optimizations

View File

@ -354,15 +354,69 @@ public class TermInSetQuery extends Query implements Accountable {
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
if (weightOrBitSet == null) {
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
Terms indexTerms = context.reader().terms(field);
if (indexTerms == null) {
return null;
} else if (weightOrBitSet.weight != null) {
return weightOrBitSet.weight.scorer(context);
} else {
return scorer(weightOrBitSet.set);
}
// Cost estimation reasoning is:
// 1. Assume every query term matches at least one document (queryTermsCount).
// 2. Determine the total number of docs beyond the first one for each term.
// That count provides a ceiling on the number of extra docs that could match beyond
// that first one. (We omit the first since it's already been counted in #1).
// This approach still provides correct worst-case cost in general, but provides tighter
// estimates for primary-key-like fields. See: LUCENE-10207
// TODO: This cost estimation may grossly overestimate since we have no index statistics
// for the specific query terms. While it's nice to avoid the cost of intersecting the
// query terms with the index, it could be beneficial to do that work and get better
// cost estimates.
final long cost;
final long queryTermsCount = termData.size();
long potentialExtraCost = indexTerms.getSumDocFreq();
final long indexedTermCount = indexTerms.size();
if (indexedTermCount != -1) {
potentialExtraCost -= indexedTermCount;
}
cost = queryTermsCount + potentialExtraCost;
final Weight weight = this;
return new ScorerSupplier() {
@Override
public Scorer get(long leadCost) throws IOException {
WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
if (weightOrDocIdSet == null) {
return null;
}
final Scorer scorer;
if (weightOrDocIdSet.weight != null) {
scorer = weightOrDocIdSet.weight.scorer(context);
} else {
scorer = scorer(weightOrDocIdSet.set);
}
return Objects.requireNonNullElseGet(
scorer,
() ->
new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
}
@Override
public long cost() {
return cost;
}
};
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
  // Convenience path: build the supplier and immediately materialize the scorer,
  // using Long.MAX_VALUE as the lead cost since no cheaper clause is driving iteration.
  final ScorerSupplier scorerSupplier = scorerSupplier(context);
  return scorerSupplier == null ? null : scorerSupplier.get(Long.MAX_VALUE);
}
@Override