From 2ed60e80736dd81da182c51920a7b4ff71f358a1 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Mon, 30 Oct 2023 10:05:52 -0400 Subject: [PATCH] Ensure negative scores are not returned by vector similarity functions (#12727) We shouldn't ever return negative scores from vector similarity functions. Given vector panama and nearly antipodal float[] vectors, it is possible that cosine and (normalized) dot-product become slightly negative due to compounding floating point errors. Since we don't want to make panama vector incredibly slow, we stick to float32 operations for now, and just snap to `0` if the score is negative after our correction. closes: https://github.com/apache/lucene/issues/12700 --- lucene/CHANGES.txt | 2 ++ .../lucene/index/VectorSimilarityFunction.java | 4 ++-- .../org/apache/lucene/util/TestVectorUtil.java | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e3376fdb07c..c95f44eceef 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -264,6 +264,8 @@ Bug Fixes * GITHUB#12682: Scorer should sum up scores into a double. (Shubham Chaudhary) +* GITHUB#12727: Ensure negative scores are not returned by vector similarity functions (Ben Trent) + Build --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java b/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java index ae0633e8c0f..23bd0fd0ec6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java +++ b/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java @@ -52,7 +52,7 @@ public enum VectorSimilarityFunction { DOT_PRODUCT { @Override public float compare(float[] v1, float[] v2) { - return (1 + dotProduct(v1, v2)) / 2; + return Math.max((1 + dotProduct(v1, v2)) / 2, 0); } @Override @@ -70,7 +70,7 @@ public enum VectorSimilarityFunction { COSINE { @Override public float compare(float[] v1, float[] v2) { - return (1 + cosine(v1, v2)) / 2; + return Math.max((1 + cosine(v1, v2)) / 2, 0); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java index 358db95641f..3153f3992fb 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java @@ -17,6 +17,7 @@ package org.apache.lucene.util; import java.util.Random; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; @@ -115,6 +116,19 @@ public class TestVectorUtil extends LuceneTestCase { expectThrows(IllegalArgumentException.class, () -> VectorUtil.l2normalize(v)); } + public void testExtremeNumerics() { + float[] v1 = new float[1536]; + float[] v2 = new float[1536]; + for (int i = 0; i < 1536; i++) { + v1[i] = 0.888888f; + v2[i] = -0.777777f; + } + for (VectorSimilarityFunction vectorSimilarityFunction : VectorSimilarityFunction.values()) { + float v = vectorSimilarityFunction.compare(v1, v2); + assertTrue(vectorSimilarityFunction + " expected >=0 got:" + v, v >= 0); + } + } + private static float l2(float[] v) { float l2 = 0; for (float x : v) {