Expose more parameters in MultiLeafKnnCollector ctor (#13826)

There is currently no way to configure two parameters for the multi-leaf collector. For expert extensibility, this commit adds another ctor for advance usage:

closes: #13699
This commit is contained in:
Benjamin Trent 2024-10-02 13:58:16 -04:00 committed by GitHub
parent 3ede9de6aa
commit bf74233912
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 28 additions and 3 deletions

View File

@ -33,17 +33,17 @@ public final class MultiLeafKnnCollector implements KnnCollector {
// greediness of globally non-competitive search: (0,1]
private static final float DEFAULT_GREEDINESS = 0.9f;
private static final int DEFAULT_INTERVAL = 0xff;
// the global queue of the highest similarities collected so far across all segments
private final BlockingFloatHeap globalSimilarityQueue;
// the local queue of the highest similarities if we are not competitive globally
// the size of this queue is defined by greediness
private final FloatHeap nonCompetitiveQueue;
private final float greediness;
// the queue of the local similarities to periodically update with the global queue
private final FloatHeap updatesQueue;
private final float[] updatesScratch;
// interval to synchronize the local and global queues, as a number of visited vectors
private final int interval = 0xff; // 255
private final int interval;
private boolean kResultsCollected = false;
private float cachedGlobalMinSim = Float.NEGATIVE_INFINITY;
private final AbstractKnnCollector subCollector;
@ -58,7 +58,32 @@ public final class MultiLeafKnnCollector implements KnnCollector {
*/
public MultiLeafKnnCollector(
int k, BlockingFloatHeap globalSimilarityQueue, AbstractKnnCollector subCollector) {
this.greediness = DEFAULT_GREEDINESS;
this(k, DEFAULT_GREEDINESS, DEFAULT_INTERVAL, globalSimilarityQueue, subCollector);
}
/**
* Create a new MultiLeafKnnCollector.
*
* @param k the number of neighbors to collect
* @param greediness the greediness of the global search
* @param interval (by number of collected values) the interval to synchronize the local and
* global queues
* @param globalSimilarityQueue the global queue of the highest similarities collected so far
* @param subCollector the local collector
*/
public MultiLeafKnnCollector(
int k,
float greediness,
int interval,
BlockingFloatHeap globalSimilarityQueue,
AbstractKnnCollector subCollector) {
if (greediness < 0 || greediness > 1) {
throw new IllegalArgumentException("greediness must be in [0,1]");
}
if (interval <= 0) {
throw new IllegalArgumentException("interval must be positive");
}
this.interval = interval;
this.subCollector = subCollector;
this.globalSimilarityQueue = globalSimilarityQueue;
this.nonCompetitiveQueue = new FloatHeap(Math.max(1, Math.round((1 - greediness) * k)));