Fix bug in SQ when just a single vector present in a segment (#13374)

This commit fixes a corner case in the ScalarQuantizer when just a single vector is present. I ran into this when updating a test that previously passed successfully with Lucene 9.10 but fails in 9.x.

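With just a single vector there are no score docs and no score variance, so the score error correction is calculated to be NaN. The fix returns 0.0 in that case instead of propagating the NaN.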
This commit is contained in:
Chris Hegarty 2024-05-16 14:59:56 +01:00 committed by GitHub
parent 731cecf730
commit 3d671a0fbe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 58 additions and 1 deletions

View File

@ -372,6 +372,8 @@ Bug Fixes
* GITHUB#13378: Fix points writing with no values (Chris Hegarty)
* GITHUB#13374: Fix bug in SQ when just a single vector present in a segment (Chris Hegarty)
Build
---------------------

View File

@ -698,7 +698,7 @@ public class ScalarQuantizer {
}
corr.add(1 - errors.var() / scoreVariance);
}
return corr.mean;
return Double.isNaN(corr.mean) ? 0.0 : corr.mean;
}
}
}

View File

@ -29,12 +29,14 @@ import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@ -252,6 +254,59 @@ public class TestLucene99ScalarQuantizedVectorScorer extends LuceneTestCase {
}
}
/** Runs the single-vector-per-segment scenario using the cosine similarity function. */
public void testSingleVectorPerSegmentCosine() throws IOException {
  final VectorSimilarityFunction similarity = VectorSimilarityFunction.COSINE;
  testSingleVectorPerSegment(similarity);
}
/** Runs the single-vector-per-segment scenario using the dot-product similarity function. */
public void testSingleVectorPerSegmentDot() throws IOException {
  final VectorSimilarityFunction similarity = VectorSimilarityFunction.DOT_PRODUCT;
  testSingleVectorPerSegment(similarity);
}
/** Runs the single-vector-per-segment scenario using the Euclidean similarity function. */
public void testSingleVectorPerSegmentEuclidean() throws IOException {
  final VectorSimilarityFunction similarity = VectorSimilarityFunction.EUCLIDEAN;
  testSingleVectorPerSegment(similarity);
}
/** Runs the single-vector-per-segment scenario using the maximum-inner-product similarity. */
public void testSingleVectorPerSegmentMIP() throws IOException {
  final VectorSimilarityFunction similarity = VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT;
  testSingleVectorPerSegment(similarity);
}
/**
 * Indexes three documents, committing after each one, force-merges them into a single segment and
 * verifies that a nearest-vector search for {@code {0.6f, 0.8f}} returns all three documents in
 * the order "B", "A", "C".
 *
 * @param sim the similarity function used for the indexed vector field
 * @throws IOException if indexing, merging or searching fails
 */
private void testSingleVectorPerSegment(VectorSimilarityFunction sim) throws IOException {
  var codec = getCodec(7, false);
  try (Directory dir = newDirectory()) {
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig().setCodec(codec))) {
      // Commit after every document so that each document is flushed on its own before the merge.
      Document docA = new Document();
      docA.add(new KnnFloatVectorField("field", new float[] {0.8f, 0.6f}, sim));
      docA.add(newTextField("id", "A", Field.Store.YES));
      writer.addDocument(docA);
      writer.commit();

      Document docB = new Document();
      docB.add(new KnnFloatVectorField("field", new float[] {0.6f, 0.8f}, sim));
      docB.add(newTextField("id", "B", Field.Store.YES));
      writer.addDocument(docB);
      writer.commit();

      Document docC = new Document();
      docC.add(new KnnFloatVectorField("field", new float[] {-0.6f, -0.8f}, sim));
      docC.add(newTextField("id", "C", Field.Store.YES));
      writer.addDocument(docC);
      writer.commit();

      writer.forceMerge(1);
    }
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      LeafReader leafReader = getOnlyLeafReader(reader);
      // The hits carry leaf-local doc ids, so resolve stored fields through the same leaf.
      StoredFields storedFields = leafReader.storedFields();
      float[] queryVector = new float[] {0.6f, 0.8f};
      var hits = leafReader.searchNearestVectors("field", queryVector, 3, null, 100);
      // Expected value first, so a failure message reports expected/actual correctly.
      assertEquals(3, hits.scoreDocs.length);
      // "B" equals the query vector, "A" is close to it, "C" points the opposite way.
      assertEquals("B", storedFields.document(hits.scoreDocs[0].doc).get("id"));
      assertEquals("A", storedFields.document(hits.scoreDocs[1].doc).get("id"));
      assertEquals("C", storedFields.document(hits.scoreDocs[2].doc).get("id"));
    }
  }
}
/**
 * Encodes a float as its four-byte little-endian representation.
 *
 * @param value the float to encode
 * @return a new four-element array holding {@code value} in little-endian byte order
 */
private static byte[] floatToByteArray(float value) {
  ByteBuffer buffer = ByteBuffer.allocate(Float.BYTES);
  buffer.order(ByteOrder.LITTLE_ENDIAN);
  buffer.putFloat(value);
  return buffer.array();
}