mirror of https://github.com/apache/lucene.git
LUCENE-10207: TermInSetQuery now provides a ScorerSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)
This commit is contained in:
parent
0462a0ad73
commit
680f21dca5
|
@ -101,6 +101,9 @@ Improvements
|
|||
---------------------
|
||||
* LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
|
||||
|
||||
* LUCENE-10207: TermInSetQuery can now provide a ScorerSupplier with cost estimation, making it
|
||||
usable in IndexOrDocValuesQuery. (Greg Miller)
|
||||
|
||||
* GITHUB#11715: Add Integer awareness to RamUsageEstimator.sizeOf (Mike Drob)
|
||||
|
||||
Optimizations
|
||||
|
|
|
@ -354,15 +354,69 @@ public class TermInSetQuery extends Query implements Accountable {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
|
||||
if (weightOrBitSet == null) {
|
||||
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
|
||||
Terms indexTerms = context.reader().terms(field);
|
||||
if (indexTerms == null) {
|
||||
return null;
|
||||
} else if (weightOrBitSet.weight != null) {
|
||||
return weightOrBitSet.weight.scorer(context);
|
||||
} else {
|
||||
return scorer(weightOrBitSet.set);
|
||||
}
|
||||
|
||||
// Cost estimation reasoning is:
|
||||
// 1. Assume every query term matches at least one document (queryTermsCount).
|
||||
// 2. Determine the total number of docs beyond the first one for each term.
|
||||
// That count provides a ceiling on the number of extra docs that could match beyond
|
||||
// that first one. (We omit the first since it's already been counted in #1).
|
||||
// This approach still provides correct worst-case cost in general, but provides tighter
|
||||
// estimates for primary-key-like fields. See: LUCENE-10207
|
||||
|
||||
// TODO: This cost estimation may grossly overestimate since we have no index statistics
|
||||
// for the specific query terms. While it's nice to avoid the cost of intersecting the
|
||||
// query terms with the index, it could be beneficial to do that work and get better
|
||||
// cost estimates.
|
||||
final long cost;
|
||||
final long queryTermsCount = termData.size();
|
||||
long potentialExtraCost = indexTerms.getSumDocFreq();
|
||||
final long indexedTermCount = indexTerms.size();
|
||||
if (indexedTermCount != -1) {
|
||||
potentialExtraCost -= indexedTermCount;
|
||||
}
|
||||
cost = queryTermsCount + potentialExtraCost;
|
||||
|
||||
final Weight weight = this;
|
||||
return new ScorerSupplier() {
|
||||
@Override
|
||||
public Scorer get(long leadCost) throws IOException {
|
||||
WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
|
||||
if (weightOrDocIdSet == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final Scorer scorer;
|
||||
if (weightOrDocIdSet.weight != null) {
|
||||
scorer = weightOrDocIdSet.weight.scorer(context);
|
||||
} else {
|
||||
scorer = scorer(weightOrDocIdSet.set);
|
||||
}
|
||||
|
||||
return Objects.requireNonNullElseGet(
|
||||
scorer,
|
||||
() ->
|
||||
new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return cost;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Returns a scorer for the given leaf by delegating to {@code scorerSupplier(context)}.
 *
 * @param context the leaf reader context to score against
 * @return {@code null} when {@code scorerSupplier(context)} returns {@code null}; otherwise
 *     the result of calling {@code get(Long.MAX_VALUE)} on that supplier
 * @throws IOException declared by this method; may be propagated from the supplier calls
 */
@Override
|
||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||
final ScorerSupplier supplier = scorerSupplier(context);
|
||||
if (supplier == null) {
|
||||
return null;
|
||||
}
|
||||
// Long.MAX_VALUE is passed as the lead cost; the get(long leadCost) implementation visible
// in this class does not read that argument.
return supplier.get(Long.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue