Improve MaxScoreBulkScorer partitioning logic. (#12457)

Partitioning scorers is an optimization problem: among all subsets of scorers
whose sum of max window scores is less than the minimum competitive score, the
optimal set of non-essential scorers is the one that maximizes the sum of costs.

The current approach consists of sorting scorers by maximum score within the
window and computing the set of non-essential clauses as the first scorers
whose sum of max scores is less than the minimum competitive score, i.e. you
cannot have a competitive hit by matching only non-essential clauses.

This sorting logic works well in the common case when costs are inversely
correlated with maximum scores and gives an optimal solution: the above
algorithm will also optimize the cost of non-essential clauses and thus
minimize the cost of essential clauses, in turn further improving query
runtimes. But this isn't true for all queries. E.g. fuzzy queries compute
scores based on artificial term statistics, so costs are no longer inversely
correlated with maximum scores. This was especially visible with the query
`titel~2` on the wikipedia dataset, as `title` matches this query and is a
high-frequency term. Yet the score contribution of this term is of the same
order of magnitude as the contribution of most other terms, so query runtime
improves considerably if this clause is considered non-essential rather than
essential.

This commit optimizes the partitioning logic a bit by sorting clauses by
`max_score / cost` instead of just `max_score`. This will not change anything
in the common case when max scores are inversely correlated with costs, but can
significantly help otherwise. E.g. `titel~2` went from 41ms to 13ms on my
machine and the wikimedium10m dataset with this change.
This commit is contained in:
Adrien Grand 2023-07-29 21:02:28 +02:00 committed by GitHub
parent 32ec38271e
commit 5e725964a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 41 additions and 14 deletions

View File

@ -30,6 +30,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
private final int maxDoc;
// All scorers, sorted by increasing max score.
private final DisiWrapper[] allScorers;
private final DisiWrapper[] scratch;
// These are the last scorers from `allScorers` that are "essential", ie. required for a match to
// have a competitive score.
private final DisiPriorityQueue essentialQueue;
@ -49,6 +50,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
MaxScoreBulkScorer(int maxDoc, List<Scorer> scorers) throws IOException {
this.maxDoc = maxDoc;
allScorers = new DisiWrapper[scorers.size()];
scratch = new DisiWrapper[allScorers.length];
int i = 0;
long cost = 0;
for (Scorer scorer : scorers) {
@ -91,16 +93,25 @@ final class MaxScoreBulkScorer extends BulkScorer {
while (top.doc < outerWindowMax) {
scoreInnerWindow(collector, acceptDocs, outerWindowMax);
top = essentialQueue.top();
if (minCompetitiveScoreUpdated) {
minCompetitiveScoreUpdated = false;
if (partitionScorers() == false) {
outerWindowMin = outerWindowMax;
continue outer;
} else {
// Partitioning may have swapped essential and non-essential scorers, and some of the
// non-essential scorers may be behind the last scored doc. So let's advance to the next
// candidate match.
final int nextCandidateMatch = top.doc;
top = essentialQueue.top();
while (top.doc < nextCandidateMatch) {
top.doc = top.iterator.advance(nextCandidateMatch);
top = essentialQueue.updateTop();
}
}
}
top = essentialQueue.top();
}
outerWindowMin = outerWindowMax;
}
@ -220,9 +231,6 @@ final class MaxScoreBulkScorer extends BulkScorer {
if (maxPossibleScore < minCompetitiveScore) {
// Hit is not competitive.
return;
} else if (maxScoreSums[i] == 0f) {
// Can break since scorers are sorted by ascending score.
break;
}
DisiWrapper scorer = allScorers[i];
@ -239,19 +247,38 @@ final class MaxScoreBulkScorer extends BulkScorer {
}
private boolean partitionScorers() {
Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxWindowScore));
// Partitioning scorers is an optimization problem: the optimal set of non-essential scorers is
// the subset of scorers whose sum of max window scores is less than the minimum competitive
// score that maximizes the sum of costs.
// Computing the optimal solution to this problem would take O(2^num_clauses). As a first
// approximation, we take the first scorers sorted by max_window_score / cost whose sum of max
// scores is less than the minimum competitive score. In the common case, maximum scores are
// inversely correlated with document frequency so this is the same as only sorting by maximum
// score, as described in the MAXSCORE paper and gives the optimal solution. However, this can
// make a difference when using custom scores (like FuzzyQuery), high query-time boosts, or
// scoring based on wacky weights.
System.arraycopy(allScorers, 0, scratch, 0, allScorers.length);
Arrays.sort(
scratch,
Comparator.comparingDouble(
scorer -> (double) scorer.maxWindowScore / Math.max(1L, scorer.cost)));
double maxScoreSum = 0;
for (firstEssentialScorer = 0;
firstEssentialScorer < allScorers.length;
++firstEssentialScorer) {
maxScoreSum += allScorers[firstEssentialScorer].maxWindowScore;
maxScoreSums[firstEssentialScorer] = maxScoreSum;
firstEssentialScorer = 0;
for (int i = 0; i < allScorers.length; ++i) {
final DisiWrapper w = scratch[i];
double newMaxScoreSum = maxScoreSum + w.maxWindowScore;
float maxScoreSumFloat =
MaxScoreSumPropagator.scoreSumUpperBound(maxScoreSum, firstEssentialScorer + 1);
if (maxScoreSumFloat >= minCompetitiveScore) {
break;
MaxScoreSumPropagator.scoreSumUpperBound(newMaxScoreSum, firstEssentialScorer + 1);
if (maxScoreSumFloat < minCompetitiveScore) {
maxScoreSum = newMaxScoreSum;
allScorers[firstEssentialScorer] = w;
maxScoreSums[firstEssentialScorer] = maxScoreSum;
firstEssentialScorer++;
} else {
allScorers[allScorers.length - 1 - (i - firstEssentialScorer)] = w;
}
}
if (firstEssentialScorer == allScorers.length) {
return false;
}