mirror of https://github.com/apache/lucene.git
Fix global score update bug in MultiLeafKnnCollector (#13463)
This commit is contained in:
parent
7e31f56ea1
commit
937c004eda
|
@ -279,7 +279,9 @@ Optimizations
|
|||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
(No changes)
|
||||
|
||||
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
|
||||
some corner cases. (Greg Miller)
|
||||
|
||||
Other
|
||||
--------------------
|
||||
|
|
|
@ -41,6 +41,7 @@ public final class MultiLeafKnnCollector implements KnnCollector {
|
|||
private final float greediness;
|
||||
// the queue of the local similarities to periodically update with the global queue
|
||||
private final FloatHeap updatesQueue;
|
||||
private final float[] updatesScratch;
|
||||
// interval to synchronize the local and global queues, as a number of visited vectors
|
||||
private final int interval = 0xff; // 255
|
||||
private boolean kResultsCollected = false;
|
||||
|
@ -62,6 +63,7 @@ public final class MultiLeafKnnCollector implements KnnCollector {
|
|||
this.globalSimilarityQueue = globalSimilarityQueue;
|
||||
this.nonCompetitiveQueue = new FloatHeap(Math.max(1, Math.round((1 - greediness) * k)));
|
||||
this.updatesQueue = new FloatHeap(k);
|
||||
this.updatesScratch = new float[k];
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -103,9 +105,18 @@ public final class MultiLeafKnnCollector implements KnnCollector {
|
|||
if (kResultsCollected) {
|
||||
// as we've collected k results, we can start do periodic updates with the global queue
|
||||
if (firstKResultsCollected || (subCollector.visitedCount() & interval) == 0) {
|
||||
cachedGlobalMinSim = globalSimilarityQueue.offer(updatesQueue.getHeap());
|
||||
updatesQueue.clear();
|
||||
globalSimUpdated = true;
|
||||
// BlockingFloatHeap#offer requires input to be sorted in ascending order, so we can't
|
||||
// pass in the underlying updatesQueue array as-is since it is only partially ordered
|
||||
// (see GH#13462):
|
||||
int len = updatesQueue.size();
|
||||
if (len > 0) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
updatesScratch[i] = updatesQueue.poll();
|
||||
}
|
||||
assert updatesQueue.size() == 0;
|
||||
cachedGlobalMinSim = globalSimilarityQueue.offer(updatesScratch, len);
|
||||
globalSimUpdated = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return localSimUpdated || globalSimUpdated;
|
||||
|
|
|
@ -72,12 +72,13 @@ public final class BlockingFloatHeap {
|
|||
* <p>Values must be sorted in ascending order.
|
||||
*
|
||||
* @param values a set of values to insert, must be sorted in ascending order
|
||||
* @param len number of values from the {@code values} array to insert
|
||||
* @return the new 'top' element in the queue.
|
||||
*/
|
||||
public float offer(float[] values) {
|
||||
public float offer(float[] values, int len) {
|
||||
lock.lock();
|
||||
try {
|
||||
for (int i = values.length - 1; i >= 0; i--) {
|
||||
for (int i = len - 1; i >= 0; i--) {
|
||||
if (size < maxSize) {
|
||||
push(values[i]);
|
||||
} else {
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.knn;
|
||||
|
||||
import org.apache.lucene.search.TopKnnCollector;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.hnsw.BlockingFloatHeap;
|
||||
|
||||
public class TestMultiLeafKnnCollector extends LuceneTestCase {
|
||||
|
||||
/** Validates a fix for GH#13462 */
|
||||
public void testGlobalScoreCoordination() {
|
||||
int k = 7;
|
||||
BlockingFloatHeap globalHeap = new BlockingFloatHeap(k);
|
||||
MultiLeafKnnCollector collector1 =
|
||||
new MultiLeafKnnCollector(k, globalHeap, new TopKnnCollector(k, Integer.MAX_VALUE));
|
||||
MultiLeafKnnCollector collector2 =
|
||||
new MultiLeafKnnCollector(k, globalHeap, new TopKnnCollector(k, Integer.MAX_VALUE));
|
||||
|
||||
// Collect k (7) hits in collector1 with scores [100, 106]:
|
||||
for (int i = 0; i < k; i++) {
|
||||
collector1.collect(0, 100f + i);
|
||||
}
|
||||
|
||||
// The global heap should be updated since k hits were collected, and have a min score of
|
||||
// 100:
|
||||
assertEquals(100f, globalHeap.peek(), 0f);
|
||||
assertEquals(100f, collector1.minCompetitiveSimilarity(), 0f);
|
||||
|
||||
// Collect k (7) hits in collector2 with only two that are competitive (200 and 300),
|
||||
// which also forces an update of the global heap with collector2's hits. This is a tricky
|
||||
// case where the heap will not be fully ordered, so it ensures global queue updates don't
|
||||
// incorrectly short-circuit (see GH#13462):
|
||||
collector2.collect(0, 10f);
|
||||
collector2.collect(0, 11f);
|
||||
collector2.collect(0, 12f);
|
||||
collector2.collect(0, 13f);
|
||||
collector2.collect(0, 200f);
|
||||
collector2.collect(0, 14f);
|
||||
collector2.collect(0, 300f);
|
||||
|
||||
// At this point, our global heap should contain [102, 103, 104, 105, 106, 200, 300] since
|
||||
// values 200 and 300 from collector2 should have pushed out 100 and 101 from collector1.
|
||||
// The min value on the global heap should be 102:
|
||||
assertEquals(102f, globalHeap.peek(), 0f);
|
||||
assertEquals(102f, collector2.minCompetitiveSimilarity(), 0f);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue