Fix HNSW graph reading with excessive connections (#12571)

When the HNSW graph is re-used during segment merges, a vector can end up with more than the configured M*2 connections.

In those instances, the graph should still be readable from the codec and remain searchable.
This commit is contained in:
Benjamin Trent 2023-09-19 15:38:23 -04:00 committed by GitHub
parent 1d0edd76a5
commit fe348de619
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 1 deletions

View File

@ -236,6 +236,8 @@ Bug Fixes
* GITHUB#12555: Fix bug in TermsEnum#seekCeil on doc values terms enums
that causes IndexOutOfBoundsException. (Egor Potemkin)
* GITHUB#12571: Fix HNSW graph read bug when built with excessive connections. (Ben Trent)
Other
---------------------

View File

@ -40,6 +40,7 @@ import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@ -457,7 +458,7 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader {
private final DirectMonotonicReader graphLevelNodeOffsets;
private final long[] graphLevelNodeIndexOffsets;
// Initially allocated to be M*2 to track the current neighbors being explored; grown on demand
// when a node is read with more connections than the buffer can hold
private final int[] currentNeighborsBuffer;
private int[] currentNeighborsBuffer;
OffHeapHnswGraph(FieldEntry entry, IndexInput vectorIndex) throws IOException {
this.dataIn =
@ -491,6 +492,9 @@ public final class Lucene95HnswVectorsReader extends KnnVectorsReader {
dataIn.seek(graphLevelNodeOffsets.get(targetIndex + graphLevelNodeIndexOffsets[level]));
arcCount = dataIn.readVInt();
if (arcCount > 0) {
if (arcCount > currentNeighborsBuffer.length) {
currentNeighborsBuffer = ArrayUtil.grow(currentNeighborsBuffer, arcCount);
}
currentNeighborsBuffer[0] = dataIn.readVInt();
for (int i = 1; i < arcCount; i++) {
currentNeighborsBuffer[i] = currentNeighborsBuffer[i - 1] + dataIn.readVInt();