mirror of https://github.com/apache/lucene.git
Fix integer overflow when seeking the vector index for connections (#11905)
* Fix integer overflow when seeking the vector index for connections
* Adding monster test to cause overflow failure
parent f7417d5961
commit 1360baaee9
lucene/CHANGES.txt

@@ -185,6 +185,14 @@ Build
 
 * GITHUB#11886: Upgrade to gradle 7.5.1 (Dawid Weiss)
 
+======================== Lucene 9.4.2 =======================
+
+Bug Fixes
+---------------------
+* GITHUB#11905: Fix integer overflow when seeking the vector index for connections in a single segment.
+  This addresses a bug that was introduced in 9.2.0 where having many vectors is not handled well
+  in the vector connections reader.
+
 ======================== Lucene 9.4.1 =======================
 
 Bug Fixes
Lucene91HnswVectorsReader.java

@@ -383,13 +383,17 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
       // calculate for each level the start offsets in vectorIndex file from where to read
       // neighbours
       graphOffsetsByLevel = new long[numLevels];
+      final long connectionsAndSizeBytes =
+          Math.multiplyExact(Math.addExact(1L, maxConn), Integer.BYTES);
       for (int level = 0; level < numLevels; level++) {
         if (level == 0) {
           graphOffsetsByLevel[level] = 0;
         } else {
           int numNodesOnPrevLevel = level == 1 ? size : nodesByLevel[level - 1].length;
           graphOffsetsByLevel[level] =
-              graphOffsetsByLevel[level - 1] + (1 + maxConn) * Integer.BYTES * numNodesOnPrevLevel;
+              Math.addExact(
+                  graphOffsetsByLevel[level - 1],
+                  Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
         }
       }
     }
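The root cause, repeated across the hunks below, is that expressions like (1 + maxConn) * Integer.BYTES * numNodesOnPrevLevel are evaluated entirely in 32-bit int arithmetic and only widened to long at assignment, so a large enough level silently wraps the offset negative. A minimal standalone sketch of the failure mode and of the exact-arithmetic fix (the node count is illustrative, not taken from the patch):

public class OverflowDemo {
  public static void main(String[] args) {
    int maxConn = 16; // Lucene's default HNSW max connections
    int numNodesOnPrevLevel = 40_000_000; // illustrative node count

    // Old form: every operand is an int, so the product wraps in 32-bit
    // arithmetic before the widening assignment to long.
    long wrapped = (1 + maxConn) * Integer.BYTES * numNodesOnPrevLevel;
    System.out.println(wrapped); // -1574967296: a bogus negative file offset

    // Patched form: compute in long arithmetic, and fail fast with an
    // ArithmeticException rather than wrap if long range is ever exceeded.
    long bytesPerNode = Math.multiplyExact(Math.addExact(1L, maxConn), Integer.BYTES);
    long offset = Math.multiplyExact(bytesPerNode, (long) numNodesOnPrevLevel);
    System.out.println(offset); // 2720000000
  }
}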
@@ -542,7 +546,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
       this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
       this.size = entry.size();
       this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
-      this.bytesForConns = ((long) entry.maxConn + 1) * Integer.BYTES;
+      this.bytesForConns = Math.multiplyExact(Math.addExact(entry.maxConn, 1L), Integer.BYTES);
     }

     @Override
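For context on how these two fields are consumed, a hedged sketch (not the exact Lucene method body): when traversing the graph, the off-heap reader seeks the vector-index input to an offset derived from both graphOffsetsByLevel and bytesForConns, so a wrapped offset corrupts every neighbor read on the affected level.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

class SeekSketch {
  // Names mirror the patch; the arithmetic, not the exact code, is the point.
  static void seekToNeighbors(
      IndexInput vectorIndex,
      long[] graphOffsetsByLevel,
      long bytesForConns,
      int level,
      int targetIndex)
      throws IOException {
    // With a negative graphOffsetsByLevel[level], this lands at a bogus
    // position (or throws) for every node on the level.
    long offset = graphOffsetsByLevel[level] + targetIndex * bytesForConns;
    vectorIndex.seek(offset);
  }
}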
Lucene92HnswVectorsReader.java

@@ -366,16 +366,20 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
       // calculate for each level the start offsets in vectorIndex file from where to read
       // neighbours
       graphOffsetsByLevel = new long[numLevels];
+      final long connectionsAndSizeLevel0Bytes =
+          Math.multiplyExact(Math.addExact(1, Math.multiplyExact(M, 2L)), Integer.BYTES);
+      final long connectionsAndSizeBytes = Math.multiplyExact(Math.addExact(1L, M), Integer.BYTES);
       for (int level = 0; level < numLevels; level++) {
         if (level == 0) {
           graphOffsetsByLevel[level] = 0;
         } else if (level == 1) {
-          int numNodesOnLevel0 = size;
-          graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
+          graphOffsetsByLevel[level] = Math.multiplyExact(connectionsAndSizeLevel0Bytes, size);
         } else {
           int numNodesOnPrevLevel = nodesByLevel[level - 1].length;
           graphOffsetsByLevel[level] =
-              graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel;
+              Math.addExact(
+                  graphOffsetsByLevel[level - 1],
+                  Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
         }
       }
     }
@@ -408,8 +412,9 @@ public final class Lucene92HnswVectorsReader extends KnnVectorsReader {
       this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
       this.size = entry.size();
       this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
-      this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES;
-      this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES;
+      this.bytesForConns = Math.multiplyExact(Math.addExact(entry.M, 1L), Integer.BYTES);
+      this.bytesForConns0 =
+          Math.multiplyExact(Math.addExact(Math.multiplyExact(entry.M, 2L), 1), Integer.BYTES);
     }

     @Override
Lucene94HnswVectorsReader.java

@@ -394,16 +394,20 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
       // calculate for each level the start offsets in vectorIndex file from where to read
       // neighbours
       graphOffsetsByLevel = new long[numLevels];
+      final long connectionsAndSizeLevel0Bytes =
+          Math.multiplyExact(Math.addExact(1, Math.multiplyExact(M, 2L)), Integer.BYTES);
+      final long connectionsAndSizeBytes = Math.multiplyExact(Math.addExact(1L, M), Integer.BYTES);
       for (int level = 0; level < numLevels; level++) {
         if (level == 0) {
           graphOffsetsByLevel[level] = 0;
         } else if (level == 1) {
-          int numNodesOnLevel0 = size;
-          graphOffsetsByLevel[level] = (1 + (M * 2)) * Integer.BYTES * numNodesOnLevel0;
+          graphOffsetsByLevel[level] = connectionsAndSizeLevel0Bytes * size;
         } else {
           int numNodesOnPrevLevel = nodesByLevel[level - 1].length;
           graphOffsetsByLevel[level] =
-              graphOffsetsByLevel[level - 1] + (1 + M) * Integer.BYTES * numNodesOnPrevLevel;
+              Math.addExact(
+                  graphOffsetsByLevel[level - 1],
+                  Math.multiplyExact(connectionsAndSizeBytes, numNodesOnPrevLevel));
         }
       }
     }
@@ -436,8 +440,9 @@ public final class Lucene94HnswVectorsReader extends KnnVectorsReader {
       this.entryNode = numLevels > 1 ? nodesByLevel[numLevels - 1][0] : 0;
       this.size = entry.size();
      this.graphOffsetsByLevel = entry.graphOffsetsByLevel;
-      this.bytesForConns = ((long) entry.M + 1) * Integer.BYTES;
-      this.bytesForConns0 = ((long) (entry.M * 2) + 1) * Integer.BYTES;
+      this.bytesForConns = Math.multiplyExact(Math.addExact(entry.M, 1L), Integer.BYTES);
+      this.bytesForConns0 =
+          Math.multiplyExact(Math.addExact(Math.multiplyExact(entry.M, 2L), 1), Integer.BYTES);
     }

     @Override
Lucene94HnswVectorsWriter.java

@@ -658,10 +658,11 @@ public final class Lucene94HnswVectorsWriter extends KnnVectorsWriter {
     @Override
     public long ramBytesUsed() {
       if (vectors.size() == 0) return 0;
+      long vectorSize = vectors.size();
       return docsWithField.ramBytesUsed()
-          + vectors.size()
+          + vectorSize
               * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER)
-          + vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize
+          + vectorSize * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize
           + hnswGraphBuilder.getGraph().ramBytesUsed();
     }
   }
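The ramBytesUsed change is the same int-promotion trap in accounting code: vectors.size(), the vector dimension, and the encoding's byteSize are all ints, so their product is computed in 32-bit arithmetic before the additions. Hoisting the count into a long local promotes each multiplication to 64-bit. A short sketch with illustrative numbers (not taken from the patch):

public class RamEstimateDemo {
  public static void main(String[] args) {
    int numVectors = 4_000_000; // illustrative buffered-vector count
    int dimension = 768;        // illustrative vector dimension
    int byteSize = Float.BYTES; // 4 bytes per float32 component

    // All-int product, as in the old code: wraps in 32-bit arithmetic.
    long wrapped = numVectors * dimension * byteSize;
    System.out.println(wrapped); // -596901888

    // Hoisting the count to long first promotes the whole chain to 64-bit.
    long vectorSize = numVectors;
    long correct = vectorSize * dimension * byteSize;
    System.out.println(correct); // 12288000000
  }
}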
TestManyKnnDocs.java (new file)

@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.KnnVectorQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.LuceneTestCase.Monster;
+import org.apache.lucene.tests.util.TestUtil;
+
+@TimeoutSuite(millis = 86_400_000) // 24 hour timeout
+@Monster("takes ~2 hours and needs extra heap, disk space, file handles")
+public class TestManyKnnDocs extends LuceneTestCase {
+  // gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true
+
+  public void testLargeSegment() throws Exception {
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setCodec(
+        TestUtil.getDefaultCodec()); // Make sure to use the default codec instead of a random one
+    iwc.setRAMBufferSizeMB(64); // Use a 64MB buffer to create larger initial segments
+    TieredMergePolicy mp = new TieredMergePolicy();
+    mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?)
+    mp.setSegmentsPerTier(256); // only merge once at the end when we ask
+    iwc.setMergePolicy(mp);
+    String fieldName = "field";
+    VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT;
+
+    try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs"));
+        IndexWriter iw = new IndexWriter(dir, iwc)) {
+
+      int numVectors = 16268816;
+      float[] vector = new float[1];
+      Document doc = new Document();
+      doc.add(new KnnVectorField(fieldName, vector, similarityFunction));
+      for (int i = 0; i < numVectors; i++) {
+        vector[0] = (i % 256);
+        iw.addDocument(doc);
+      }
+
+      // merge to single segment and then verify
+      iw.forceMerge(1);
+      iw.commit();
+      IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
+      TopDocs docs = searcher.search(new KnnVectorQuery("field", new float[] {120}, 10), 5);
+      assertEquals(5, docs.scoreDocs.length);
+    }
+  }
+}
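A note on the vector count in the test: it does not appear to be arbitrary. Assuming Lucene's default HNSW fan-out of M = 16, each level-0 adjacency row holds one neighbor-count int plus 2*M neighbor ords, all 4-byte ints, and 16,268,816 then appears to be the smallest node count whose level-1 graph offset exceeds Integer.MAX_VALUE:

    long bytesPerLevel0Node = (1 + 2 * 16) * 4L;          // 132 bytes per level-0 node
    long level1Offset = 16_268_816L * bytesPerLevel0Node; // 2_147_483_712
    // Integer.MAX_VALUE is 2_147_483_647, so the unpatched int arithmetic
    // wraps negative once a merged segment reaches this many vectors.

Forcing a merge down to a single segment therefore reliably reproduces the overflow.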