Introduce a heuristic to amortize the per-window overhead in MaxScoreBulkScorer. (#13941)

It is sometimes possible for `MaxScoreBulkScorer` to compute windows that don't
contain many candidate matches, resulting in more time spent evaluating maximum
scores per window than evaluating candidate matches on this window.

This PR introduces a heuristic that targets an average of at least 32 candidate
matches per clause per outer window to amortize the per-window overhead. This
results in a speedup for the `OrMany` task.
This commit is contained in:
Adrien Grand 2024-10-22 16:52:36 +02:00
parent ff03379eb6
commit 555c7d2ef2
2 changed files with 33 additions and 0 deletions

View File

@ -34,6 +34,9 @@ Optimizations
speeds up exhaustive evaluation of disjunctions of term queries.
(Adrien Grand)
* GITHUB#13941: Optimized computation of top-hits on disjunctive queries with
many clauses. (Adrien Grand)
Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended

View File

@ -66,6 +66,15 @@ final class MaxScoreBulkScorer extends BulkScorer {
maxScoreSums = new double[allScorers.length];
}
// Number of outer windows that have been evaluated. Incremented once per outer window and
// compared against numCandidates to estimate how many candidates each outer window yields.
private int numOuterWindows;
// Number of candidate matches so far. Incremented on every call to #scoreNonEssentialClauses.
private int numCandidates;
// Minimum window size. See #computeOuterWindowMax where we have heuristics that adjust the
// minimum window size based on the average number of candidate matches per outer window, to keep
// the per-window overhead under control. It starts at 1, is doubled (capped at
// INNER_WINDOW_SIZE) while fewer than 32 candidates per clause per outer window have been seen
// on average, and is reset to 1 otherwise.
private int minWindowSize = 1;
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
collector.setScorer(scorable);
@ -124,6 +133,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
}
outerWindowMin = Math.min(top.doc, outerWindowMax);
++numOuterWindows;
}
return nextCandidate(max);
@ -278,6 +288,23 @@ final class MaxScoreBulkScorer extends BulkScorer {
windowMax = (int) Math.min(windowMax, upTo + 1L); // upTo is inclusive
}
if (allScorers.length - firstWindowLead > 1) {
// The more clauses we consider to compute outer windows, the higher the chances that one of
// these clauses has a block boundary in the next few doc IDs. This situation can result in more
// time spent computing maximum scores per outer window than evaluating hits. To avoid such
// situations, we target at least 32 candidate matches per clause per outer window on average,
// to make sure we amortize the cost of computing maximum scores.
long threshold = numOuterWindows * 32L * allScorers.length;
if (numCandidates < threshold) {
minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE);
} else {
minWindowSize = 1;
}
int minWindowMax = (int) Math.min(Integer.MAX_VALUE, (long) windowMin + minWindowSize);
windowMax = Math.max(windowMax, minWindowMax);
}
return windowMax;
}
@ -300,6 +327,9 @@ final class MaxScoreBulkScorer extends BulkScorer {
private void scoreNonEssentialClauses(
LeafCollector collector, int doc, double essentialScore, int numNonEssentialClauses)
throws IOException {
++numCandidates;
double score = essentialScore;
for (int i = numNonEssentialClauses - 1; i >= 0; --i) {
float maxPossibleScore =