Fix HNSW graph visitation limit bug (#12413)

We have some weird behavior in HNSW searcher when finding the candidate entry point for the zeroth layer.

While trying to find the best entry point to gather the full candidate set, we don't filter based on the acceptableOrds bitset. Consequently, if we exit the search early (before hitting the zeroth layer), the results that are returned may contain documents NOT within that bitset.

Luckily since the results are marked as incomplete, the *VectorQuery logic switches back to an exact scan and throws away the results.

However, if any user called the leaf searcher directly, bypassing the query, they could run into this bug.
This commit is contained in:
Benjamin Trent 2023-07-06 15:46:36 -04:00 committed by GitHub
parent f527eb3b12
commit 861153020a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 70 additions and 18 deletions

View File

@ -155,6 +155,8 @@ Bug Fixes
* GITHUB#12388: JoinUtil queries were ignoring boosts. (Alan Woodward)
* GITHUB#12413: Fix HNSW graph search bug that potentially leaked unapproved docs (Ben Trent).
Other
---------------------

View File

@ -204,26 +204,26 @@ public class HnswGraphSearcher<T> {
if (initialEp == -1) {
return new NeighborQueue(1, true);
}
NeighborQueue results;
results = new NeighborQueue(1, false);
int[] eps = new int[] {graph.entryNode()};
int numVisited = 0;
for (int level = graph.numLevels() - 1; level >= 1; level--) {
results.clear();
graphSearcher.searchLevel(results, query, 1, level, eps, vectors, graph, null, visitedLimit);
numVisited += results.visitedCount();
visitedLimit -= results.visitedCount();
if (results.incomplete()) {
results.setVisitedCount(numVisited);
return results;
}
eps[0] = results.pop();
int[] epAndVisited = graphSearcher.findBestEntryPoint(query, vectors, graph, visitedLimit);
int numVisited = epAndVisited[1];
int ep = epAndVisited[0];
if (ep == -1) {
NeighborQueue results = new NeighborQueue(1, false);
results.setVisitedCount(numVisited);
results.markIncomplete();
return results;
}
results = new NeighborQueue(topK, false);
NeighborQueue results = new NeighborQueue(topK, false);
graphSearcher.searchLevel(
results, query, topK, 0, eps, vectors, graph, acceptOrds, visitedLimit);
results,
query,
topK,
0,
new int[] {ep},
vectors,
graph,
acceptOrds,
visitedLimit - numVisited);
results.setVisitedCount(results.visitedCount() + numVisited);
return results;
}
@ -256,6 +256,56 @@ public class HnswGraphSearcher<T> {
return results;
}
/**
* Function to find the best entry point from which to search the zeroth graph layer.
*
* @param query vector query with which to search
* @param vectors random access vector values
* @param graph the HNSWGraph
* @param visitLimit How many vectors are allowed to be visited
* @return An integer array whose first element is the best entry point, and second is the number
* of candidates visited. Entry point of `-1` indicates visitation limit exceed
* @throws IOException When accessing the vector fails
*/
private int[] findBestEntryPoint(
T query, RandomAccessVectorValues<T> vectors, HnswGraph graph, int visitLimit)
throws IOException {
int size = graph.size();
int visitedCount = 1;
prepareScratchState(vectors.size());
int currentEp = graph.entryNode();
float currentScore = compare(query, vectors, currentEp);
boolean foundBetter;
for (int level = graph.numLevels() - 1; level >= 1; level--) {
foundBetter = true;
visited.set(currentEp);
// Keep searching the given level until we stop finding a better candidate entry point
while (foundBetter) {
foundBetter = false;
graphSeek(graph, level, currentEp);
int friendOrd;
while ((friendOrd = graphNextNeighbor(graph)) != NO_MORE_DOCS) {
assert friendOrd < size : "friendOrd=" + friendOrd + "; size=" + size;
if (visited.getAndSet(friendOrd)) {
continue;
}
if (visitedCount >= visitLimit) {
return new int[] {-1, visitedCount};
}
float friendSimilarity = compare(query, vectors, friendOrd);
visitedCount++;
if (friendSimilarity > currentScore
|| (friendSimilarity == currentScore && friendOrd < currentEp)) {
currentScore = friendSimilarity;
currentEp = friendOrd;
foundBetter = true;
}
}
}
}
return new int[] {currentEp, visitedCount};
}
/**
* Add the closest neighbors found to a priority queue (heap). These are returned in REVERSE
* proximity order -- the most distant neighbor of the topK found, i.e. the one with the lowest