Replace docBase with actual docId in MaxScoreAccumulator (#13777)

We have been encoding docBase and the score in MaxScoreAccumulator#accumulate.
That makes the assumption that segments are going to be processed in doc order
and implements global max score accounting across segments searched concurrently.

With the introduction of intra-segment concurrency, the same segment may be seen
multiple times, once per segment partition. Partitions are all going to have the same
docBase, hence you may end up with topN results with higher docIds than expected,
because the search early terminates before docs with same score and lower doc ids
are seen.

This commit encodes the docId in the accumulator in place of the docBase to resolve
the described issue.
This commit is contained in:
Luca Cavanna 2024-09-13 08:58:10 +02:00 committed by GitHub
parent 5045d3c67b
commit fe64b04fda
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 21 additions and 28 deletions

View File

@ -51,9 +51,9 @@ final class MaxScoreAccumulator {
return v2;
}
void accumulate(int docBase, float score) {
assert docBase >= 0 && score >= 0;
long encode = (((long) Float.floatToIntBits(score)) << 32) | docBase;
void accumulate(int docId, float score) {
assert docId >= 0 && score >= 0;
long encode = (((long) Float.floatToIntBits(score)) << 32) | docId;
acc.accumulate(encode);
}
@ -63,24 +63,18 @@ final class MaxScoreAccumulator {
return null;
}
float score = Float.intBitsToFloat((int) (value >> 32));
int docBase = (int) value;
return new DocAndScore(docBase, score);
int docId = (int) value;
return new DocAndScore(docId, score);
}
record DocAndScore(int docBase, float score) implements Comparable<DocAndScore> {
record DocAndScore(int docId, float score) implements Comparable<DocAndScore> {
@Override
public int compareTo(DocAndScore o) {
int cmp = Float.compare(score, o.score);
if (cmp == 0) {
// tie-break on the minimum doc base
// For a given minimum competitive score, we want to know the first segment
// where this score occurred, hence the reverse order here.
// On segments with a lower docBase, any document whose score is greater
// than or equal to this score would be competitive, while on segments with a
// higher docBase, documents need to have a strictly greater score to be
// competitive since we tie break on doc ID.
return Integer.compare(o.docBase, docBase);
// tie-break on doc id, lower id has the priority
return Integer.compare(o.docId, docId);
}
return cmp;
}

View File

@ -232,7 +232,7 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
// the next float if the global minimum score is set on a document id that is
// smaller than the ids in the current leaf
float score =
docBase >= maxMinScore.docBase() ? Math.nextUp(maxMinScore.score()) : maxMinScore.score();
docBase >= maxMinScore.docId() ? Math.nextUp(maxMinScore.score()) : maxMinScore.score();
if (score > minCompetitiveScore) {
assert hitsThresholdChecker.isThresholdReached();
scorer.setMinCompetitiveScore(score);
@ -254,10 +254,9 @@ public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
minCompetitiveScore = localMinScore;
if (minScoreAcc != null) {
// we don't use the next float but we register the document
// id so that other leaves can require it if they are after
// the current maximum
minScoreAcc.accumulate(docBase, pqTop.score);
// we don't use the next float but we register the document id so that other leaves or
// leaf partitions can require it if they are after the current maximum
minScoreAcc.accumulate(pqTop.doc, pqTop.score);
}
}
}

View File

@ -24,28 +24,28 @@ public class TestMaxScoreAccumulator extends LuceneTestCase {
MaxScoreAccumulator acc = new MaxScoreAccumulator();
acc.accumulate(0, 0f);
assertEquals(0f, acc.get().score(), 0);
assertEquals(0, acc.get().docBase(), 0);
assertEquals(0, acc.get().docId(), 0);
acc.accumulate(10, 0f);
assertEquals(0f, acc.get().score(), 0);
assertEquals(0, acc.get().docBase(), 0);
assertEquals(0, acc.get().docId(), 0);
acc.accumulate(100, 1000f);
assertEquals(1000f, acc.get().score(), 0);
assertEquals(100, acc.get().docBase(), 0);
assertEquals(100, acc.get().docId(), 0);
acc.accumulate(1000, 5f);
assertEquals(1000f, acc.get().score(), 0);
assertEquals(100, acc.get().docBase(), 0);
assertEquals(100, acc.get().docId(), 0);
acc.accumulate(99, 1000f);
assertEquals(1000f, acc.get().score(), 0);
assertEquals(99, acc.get().docBase(), 0);
assertEquals(99, acc.get().docId(), 0);
acc.accumulate(1000, 1001f);
assertEquals(1001f, acc.get().score(), 0);
assertEquals(1000, acc.get().docBase(), 0);
assertEquals(1000, acc.get().docId(), 0);
acc.accumulate(10, 1001f);
assertEquals(1001f, acc.get().score(), 0);
assertEquals(10, acc.get().docBase(), 0);
assertEquals(10, acc.get().docId(), 0);
acc.accumulate(100, 1001f);
assertEquals(1001f, acc.get().score(), 0);
assertEquals(10, acc.get().docBase(), 0);
assertEquals(10, acc.get().docId(), 0);
}
public void testRandom() {
@ -56,7 +56,7 @@ public class TestMaxScoreAccumulator extends LuceneTestCase {
for (int i = 0; i < numDocs; i++) {
MaxScoreAccumulator.DocAndScore res =
new MaxScoreAccumulator.DocAndScore(random().nextInt(maxDocs), random().nextFloat());
acc.accumulate(res.docBase(), res.score());
acc.accumulate(res.docId(), res.score());
if (res.compareTo(max) > 0) {
max = res;
}