Fix global score update bug in MultiLeafKnnCollector (#13463)

This commit is contained in:
Greg Miller 2024-06-18 18:34:15 -07:00 committed by GitHub
parent 7e31f56ea1
commit 937c004eda
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 82 additions and 6 deletions

View File

@ -279,7 +279,9 @@ Optimizations
Bug Fixes
---------------------
(No changes)
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
some corner cases. (Greg Miller)
Other
--------------------

View File

@ -41,6 +41,7 @@ public final class MultiLeafKnnCollector implements KnnCollector {
private final float greediness;
// the queue of the local similarities to periodically update with the global queue
private final FloatHeap updatesQueue;
private final float[] updatesScratch;
// interval to synchronize the local and global queues, as a number of visited vectors
private final int interval = 0xff; // 255
private boolean kResultsCollected = false;
@ -62,6 +63,7 @@ public final class MultiLeafKnnCollector implements KnnCollector {
this.globalSimilarityQueue = globalSimilarityQueue;
this.nonCompetitiveQueue = new FloatHeap(Math.max(1, Math.round((1 - greediness) * k)));
this.updatesQueue = new FloatHeap(k);
this.updatesScratch = new float[k];
}
@Override
@ -103,9 +105,18 @@ public final class MultiLeafKnnCollector implements KnnCollector {
if (kResultsCollected) {
// as we've collected k results, we can start do periodic updates with the global queue
if (firstKResultsCollected || (subCollector.visitedCount() & interval) == 0) {
cachedGlobalMinSim = globalSimilarityQueue.offer(updatesQueue.getHeap());
updatesQueue.clear();
globalSimUpdated = true;
// BlockingFloatHeap#offer requires input to be sorted in ascending order, so we can't
// pass in the underlying updatesQueue array as-is since it is only partially ordered
// (see GH#13462):
int len = updatesQueue.size();
if (len > 0) {
for (int i = 0; i < len; i++) {
updatesScratch[i] = updatesQueue.poll();
}
assert updatesQueue.size() == 0;
cachedGlobalMinSim = globalSimilarityQueue.offer(updatesScratch, len);
globalSimUpdated = true;
}
}
}
return localSimUpdated || globalSimUpdated;

View File

@ -72,12 +72,13 @@ public final class BlockingFloatHeap {
* <p>Values must be sorted in ascending order.
*
* @param values a set of values to insert, must be sorted in ascending order
* @param len number of values from the {@code values} array to insert
* @return the new 'top' element in the queue.
*/
public float offer(float[] values) {
public float offer(float[] values, int len) {
lock.lock();
try {
for (int i = values.length - 1; i >= 0; i--) {
for (int i = len - 1; i >= 0; i--) {
if (size < maxSize) {
push(values[i]);
} else {

View File

@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.knn;
import org.apache.lucene.search.TopKnnCollector;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.hnsw.BlockingFloatHeap;
public class TestMultiLeafKnnCollector extends LuceneTestCase {
/** Validates a fix for GH#13462 */
public void testGlobalScoreCoordination() {
int k = 7;
BlockingFloatHeap globalHeap = new BlockingFloatHeap(k);
MultiLeafKnnCollector collector1 =
new MultiLeafKnnCollector(k, globalHeap, new TopKnnCollector(k, Integer.MAX_VALUE));
MultiLeafKnnCollector collector2 =
new MultiLeafKnnCollector(k, globalHeap, new TopKnnCollector(k, Integer.MAX_VALUE));
// Collect k (7) hits in collector1 with scores [100, 106]:
for (int i = 0; i < k; i++) {
collector1.collect(0, 100f + i);
}
// The global heap should be updated since k hits were collected, and have a min score of
// 100:
assertEquals(100f, globalHeap.peek(), 0f);
assertEquals(100f, collector1.minCompetitiveSimilarity(), 0f);
// Collect k (7) hits in collector2 with only two that are competitive (200 and 300),
// which also forces an update of the global heap with collector2's hits. This is a tricky
// case where the heap will not be fully ordered, so it ensures global queue updates don't
// incorrectly short-circuit (see GH#13462):
collector2.collect(0, 10f);
collector2.collect(0, 11f);
collector2.collect(0, 12f);
collector2.collect(0, 13f);
collector2.collect(0, 200f);
collector2.collect(0, 14f);
collector2.collect(0, 300f);
// At this point, our global heap should contain [102, 103, 104, 105, 106, 200, 300] since
// values 200 and 300 from collector2 should have pushed out 100 and 101 from collector1.
// The min value on the global heap should be 102:
assertEquals(102f, globalHeap.peek(), 0f);
assertEquals(102f, collector2.minCompetitiveSimilarity(), 0f);
}
}