fix VectorUtil.dotProductScore normalization (#1073)

This commit is contained in:
Michael Sokolov 2022-08-20 09:15:38 -04:00 committed by GitHub
parent 60fa19d509
commit 798c02dd70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 2 deletions

View File

@@ -270,7 +270,8 @@ public final class VectorUtil {
*/ */
public static float dotProductScore(BytesRef a, BytesRef b) { public static float dotProductScore(BytesRef a, BytesRef b) {
// divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len // divide by 2 * 2^14 (maximum absolute value of product of 2 signed bytes) * len
return (1 + dotProduct(a, b)) / (float) (a.length * (1 << 15)); float denom = (float) (a.length * (1 << 15));
return 0.5f + dotProduct(a, b) / denom;
} }
/** /**

View File

@@ -176,7 +176,20 @@ public class TestVectorUtil extends LuceneTestCase {
BytesRef a = new BytesRef(new byte[] {1, 2, 3}); BytesRef a = new BytesRef(new byte[] {1, 2, 3});
BytesRef b = new BytesRef(new byte[] {-10, 0, 5}); BytesRef b = new BytesRef(new byte[] {-10, 0, 5});
assertEquals(5, VectorUtil.dotProduct(a, b), 0); assertEquals(5, VectorUtil.dotProduct(a, b), 0);
assertEquals(5 / (3f * (1 << 15)), VectorUtil.dotProductScore(a, b), DELTA); float denom = a.length * (1 << 15);
assertEquals(0.5 + 5 / denom, VectorUtil.dotProductScore(a, b), DELTA);
// dot product 0 maps to dotProductScore 0.5
BytesRef zero = new BytesRef(new byte[] {0, 0, 0});
assertEquals(0.5, VectorUtil.dotProductScore(a, zero), DELTA);
BytesRef min = new BytesRef(new byte[] {-128, -128});
BytesRef max = new BytesRef(new byte[] {127, 127});
// minimum dot product score is not quite zero because 127 < 128
assertEquals(0.0039, VectorUtil.dotProductScore(min, max), DELTA);
// maximum dot product score
assertEquals(1, VectorUtil.dotProductScore(min, min), DELTA);
} }
public void testSelfDotProductBytes() { public void testSelfDotProductBytes() {