Fix integer overflow when seeking the vector index for connections (#11905)

* Fix integer overflow when seeking the vector index for connections
* Adding monster test to cause overflow failure
Benjamin Trent 2022-11-10 08:24:32 -05:00 committed by GitHub
parent f7417d5961
commit 1360baaee9
6 changed files with 104 additions and 14 deletions

lucene/CHANGES.txt

@@ -185,6 +185,14 @@ Build
 * GITHUB#11886: Upgrade to gradle 7.5.1 (Dawid Weiss)
 
+======================== Lucene 9.4.2 =======================
+
+Bug Fixes
+---------------------
+* GITHUB#11905: Fix integer overflow when seeking the vector index for connections in a single
+  segment. This addresses a bug introduced in 9.2.0 where a segment with many vectors was not
+  handled correctly by the vector connections reader.
+
 ======================== Lucene 9.4.1 =======================
 
 Bug Fixes
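
The root cause is visible in the hunks below: every operand in the old offset expression, (1 + maxConn) * Integer.BYTES * numNodes, is an int, so the product wraps in 32-bit arithmetic before the assignment widens it to long. A standalone sketch of the failure mode, not part of the commit, using Lucene's default of M = 16 and a hypothetical segment size:

// Sketch only, not part of the commit. M = 16 is Lucene's default maxConn;
// the segment size here is hypothetical.
public class HnswOffsetOverflowDemo {
  public static void main(String[] args) {
    int M = 16;
    int numNodesOnLevel0 = 20_000_000;
    // Old pattern: all-int arithmetic wraps before widening to long.
    long broken = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
    // Fixed pattern: compute in long and fail fast on overflow.
    long fixed =
        Math.multiplyExact(Math.multiplyExact(1L + M * 2, Integer.BYTES), numNodesOnLevel0);
    System.out.println(broken); // -1654967296, a negative file offset
    System.out.println(fixed);  // 2640000000, the correct offset
  }
}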

Lucene91HnswVectorsReader.java

@@ -383,13 +383,17 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
       // calculate for each level the start offsets in vectorIndex file from where to read
       // neighbours
       graphOffsetsByLevel = new long[numLevels];
+      final long connectionsAndSizeBytes =
+          Math.multiplyExact(Math.addExact(1L, maxConn), Integer.BYTES);
       for (int level = 0; level < numLevels; level++) {
         if (level == 0) {
           graphOffsetsByLevel[level] = 0;
         } else {
           int numNodesOnPrevLevel = level == 1 ? size : nodesByLevel[level - 1].length;
           graphOffsetsByLevel[level] =
-              graphOffsetsByLevel[level - 1] + (1 + maxConn) * Integer.BYTES * numNodesOnPrevLevel;
+              Math.addExact(
+                  graphOffsetsByLevel[level - 1],
+                  Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
         }
       }
     }
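
The fix hoists the per-node byte size into a long computed once per field, and replaces unchecked + and * with Math.addExact and Math.multiplyExact, which throw ArithmeticException instead of silently wrapping. A minimal illustration of that contract, not from the commit:

// Sketch only, not part of the commit: wrapping vs. exact arithmetic.
public class ExactMathDemo {
  public static void main(String[] args) {
    System.out.println(Long.MAX_VALUE + 1); // silently wraps to Long.MIN_VALUE
    try {
      Math.addExact(Long.MAX_VALUE, 1L);    // same sum, but checked
    } catch (ArithmeticException e) {
      System.out.println(e.getMessage());   // prints "long overflow"
    }
  }
}

Since these offsets are computed when the field entry is read, a segment large enough to overflow now fails loudly at open time instead of seeking to a garbage offset.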
@@ -542,7 +546,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
       this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
       this.size = entry.size();
       this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
-      this.bytesForConns = ((long) entry.maxConn + 1) * Integer.BYTES;
+      this.bytesForConns = Math.multiplyExact(Math.addExact(entry.maxConn, 1L), Integer.BYTES);
     }
 
     @Override
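
bytesForConns is the stride between consecutive nodes' neighbour lists in the vectorIndex file. Roughly how it combines with graphOffsetsByLevel when seeking, as a hedged sketch (the helper name and parameters are illustrative, not the reader's actual method, which also maps a node id to its index on the level first):

// Sketch only, not part of the commit: the seek arithmetic the two fields feed.
static long neighborListOffset(
    long[] graphOffsetsByLevel, long bytesForConns, int level, int indexOnLevel) {
  // Base offset of the level plus indexOnLevel strides, all in checked long math.
  return Math.addExact(
      graphOffsetsByLevel[level], Math.multiplyExact(bytesForConns, indexOnLevel));
}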

Lucene92HnswVectorsReader.java

@@ -366,16 +366,20 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
       // calculate for each level the start offsets in vectorIndex file from where to read
       // neighbours
       graphOffsetsByLevel = new long[numLevels];
+      final long connectionsAndSizeLevel0Bytes =
+          Math.multiplyExact(Math.addExact(1, Math.multiplyExact(M, 2L)), Integer.BYTES);
+      final long connectionsAndSizeBytes = Math.multiplyExact(Math.addExact(1L, M), Integer.BYTES);
       for (int level = 0; level < numLevels; level++) {
         if (level == 0) {
           graphOffsetsByLevel[level] = 0;
         } else if (level == 1) {
-          int numNodesOnLevel0 = size;
-          graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
+          graphOffsetsByLevel[level] = Math.multiplyExact(connectionsAndSizeLevel0Bytes, size);
         } else {
           int numNodesOnPrevLevel = nodesByLevel[level - 1].length;
           graphOffsetsByLevel[level] =
-              graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel;
+              Math.addExact(
+                  graphOffsetsByLevel[level - 1],
+                  Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
         }
       }
     }
@@ -408,8 +412,9 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
       this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
       this.size = entry.size();
       this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
-      this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES;
-      this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES;
+      this.bytesForConns = Math.multiplyExact(Math.addExact(entry.M, 1L), Integer.BYTES);
+      this.bytesForConns0 =
+          Math.multiplyExact(Math.addExact(Math.multiplyExact(entry.M, 2L), 1), Integer.BYTES);
     }
 
     @Override
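
The two strides reflect the on-disk layout the arithmetic implies: each node occupies a fixed-size slot of one int (the neighbour count) followed by its neighbour ids, and level 0 keeps up to 2 * M connections per node versus M on the upper levels. A sketch of the per-node footprint, not from the commit:

// Sketch only, not part of the commit: per-node bytes behind bytesForConns0
// (level 0) and bytesForConns (upper levels).
public class HnswNodeBytes {
  static long bytesPerNode(int m, boolean level0) {
    long conns = level0 ? 2L * m : m; // level 0 keeps twice as many links
    return Math.multiplyExact(Math.addExact(1L, conns), Integer.BYTES);
  }

  public static void main(String[] args) {
    System.out.println(bytesPerNode(16, true));  // 132 bytes per node on level 0
    System.out.println(bytesPerNode(16, false)); // 68 bytes per node above it
  }
}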

Lucene94HnswVectorsReader.java

@@ -394,16 +394,20 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
       // calculate for each level the start offsets in vectorIndex file from where to read
       // neighbours
       graphOffsetsByLevel = new long[numLevels];
+      final long connectionsAndSizeLevel0Bytes =
+          Math.multiplyExact(Math.addExact(1, Math.multiplyExact(M, 2L)), Integer.BYTES);
+      final long connectionsAndSizeBytes = Math.multiplyExact(Math.addExact(1L, M), Integer.BYTES);
       for (int level = 0; level < numLevels; level++) {
         if (level == 0) {
           graphOffsetsByLevel[level] = 0;
         } else if (level == 1) {
-          int numNodesOnLevel0 = size;
-          graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
+          graphOffsetsByLevel[level] = connectionsAndSizeLevel0Bytes * size;
         } else {
           int numNodesOnPrevLevel = nodesByLevel[level - 1].length;
           graphOffsetsByLevel[level] =
-              graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel;
+              Math.addExact(
+                  graphOffsetsByLevel[level - 1],
+                  Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
         }
       }
     }
@@ -436,8 +440,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
       this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
       this.size = entry.size();
       this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
-      this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES;
-      this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES;
+      this.bytesForConns = Math.multiplyExact(Math.addExact(entry.M, 1L), Integer.BYTES);
+      this.bytesForConns0 =
+          Math.multiplyExact(Math.addExact(Math.multiplyExact(entry.M, 2L), 1), Integer.BYTES);
     }
 
     @Override

Lucene94HnswVectorsWriter.java

@@ -658,10 +658,11 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
     @Override
     public long ramBytesUsed() {
       if (vectors.size() == 0) return 0;
+      long vectorSize = vectors.size();
       return docsWithField.ramBytesUsed()
-          + vectors.size()
+          + vectorSize
               * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER)
-          + vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize
+          + vectorSize * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize
           + hnswGraphBuilder.getGraph().ramBytesUsed();
     }
   }
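
The writer change is the same class of bug in the RAM accounting: vectors.size(), the dimension, and the encoding's byteSize are all ints, so the estimate could wrap for a large buffered field. Hoisting the size into a long makes every subsequent product long arithmetic. A sketch with hypothetical numbers, not from the commit:

// Sketch only, not part of the commit: the counts below are hypothetical.
public class RamEstimateDemo {
  public static void main(String[] args) {
    int numVectors = 4_194_304;
    int dims = 768;
    int byteSize = Float.BYTES;
    long wrong = numVectors * dims * byteSize;        // int math: wraps to 0 here
    long right = (long) numVectors * dims * byteSize; // 12884901888
    System.out.println(wrong + " vs " + right);
  }
}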

lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java (new file)

@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.document;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase.Monster;
+import org.apache.lucene.tests.util.TestUtil;
+
+@TimeoutSuite(millis = 86_400_000) // 24 hour timeout
+@Monster("takes ~2 hours and needs extra heap, disk space, file handles")
+public class TestManyKnnDocs extends LuceneTestCase {
+  // gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true
+
+  public void testLargeSegment() throws Exception {
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setCodec(
+        TestUtil.getDefaultCodec()); // Make sure to use the default codec instead of a random one
+    iwc.setRAMBufferSizeMB(64); // Use a 64MB buffer to create larger initial segments
+    TieredMergePolicy mp = new TieredMergePolicy();
+    mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?)
+    mp.setSegmentsPerTier(256); // only merge once at the end when we ask
+    iwc.setMergePolicy(mp);
+    String fieldName = "field";
+    VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
+
+    try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs"));
+        IndexWriter iw = new IndexWriter(dir, iwc)) {
+      int numVectors = 16268816;
+      float[] vector = new float[1];
+      Document doc = new Document();
+      doc.add(new KnnVectorField(fieldName, vector, similarityFunction));
+      for (int i = 0; i < numVectors; i++) {
+        vector[0] = (i % 256);
+        iw.addDocument(doc);
+      }
+
+      // merge to single segment and then verify
+      iw.forceMerge(1);
+      iw.commit();
+      IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
+      TopDocs docs = searcher.search(new KnnVectorQuery("field", new float[] {120}, 10), 5);
+      assertEquals(5, docs.scoreDocs.length);
+    }
+  }
+}
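
The vector count looks hand-picked: with the default M = 16, level 0 costs (1 + 2 * 16) * Integer.BYTES = 132 bytes per node, and 16268816 appears to be the smallest count for which 132 * size exceeds Integer.MAX_VALUE, so the level-1 base offset wraps under the old int arithmetic. A quick check, not part of the commit:

// Sketch only, not part of the commit: why 16268816 documents suffice,
// assuming the default M = 16.
public class MonsterSizeCheck {
  public static void main(String[] args) {
    long level0BytesPerNode = (1 + 2 * 16) * Integer.BYTES; // 132
    System.out.println(level0BytesPerNode * 16_268_815L);   // 2147483580, still fits an int
    System.out.println(level0BytesPerNode * 16_268_816L);   // 2147483712 > Integer.MAX_VALUE
  }
}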