From 5a5aa2c8fae6c2f9814287bfcffd18a3e9f661ef Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 16 Aug 2023 12:15:25 -0400 Subject: [PATCH] GITHUB#12342 Add new maximum inner product vector similarity method (#12479) The current dot-product score scaling and similarity implementation assumes normalized vectors. This disregards information that the model may store within the magnitude. See: https://github.com/apache/lucene/issues/12342#issuecomment-1658640222 for a good explanation of the need. To avoid breaking current scoring assumptions in Lucene, a new `MAXIMUM_INNER_PRODUCT` similarity function is added. Because the similarity from a `dotProduct` function call could be negative, this similarity scorer scales negative dotProducts to between 0-1, while all non-negative dotProduct values are shifted to the range 1-MAX. One concern with adding this similarity function is that it breaks the triangle inequality. It is commonly assumed that the triangle inequality is needed to build graph structures. But, there is conflicting research here when it comes to real-world data. See: - For: https://github.com/apache/lucene/issues/12342#issuecomment-1618258984 - Against: https://github.com/apache/lucene/issues/12342#issuecomment-1631577657, https://github.com/apache/lucene/issues/12342#issuecomment-1631808301 To check if any transformation of the input is required to satisfy the triangle inequality, many tests have been run. See: - https://github.com/apache/lucene/issues/12342#issuecomment-1653420640 - https://github.com/apache/lucene/issues/12342#issuecomment-1656112434 - https://github.com/apache/lucene/issues/12342#issuecomment-1656718447 If there are any additional tests, or issues with the provided tests & scripts, please let me know. We want to make sure this works well for our users. 
closes: https://github.com/apache/lucene/issues/12342 --- lucene/CHANGES.txt | 3 ++ .../index/VectorSimilarityFunction.java | 18 ++++++++++ .../org/apache/lucene/util/VectorUtil.java | 11 ++++++ .../search/BaseKnnVectorQueryTestCase.java | 35 ++++++++++++++++++- .../components/DocumentsPanelProvider.java | 3 ++ .../index/BaseKnnVectorsFormatTestCase.java | 3 +- 6 files changed, 71 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9e58b4fea57..782675ce4d7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -148,6 +148,9 @@ New Features search results can be provided. The first custom collector provides `ToParentBlockJoin[Float|Byte]KnnVectorQuery` joining child vector documents with their parent documents. (Ben Trent) +* GITHUB#12479: Add new Maximum Inner Product vector similarity function for non-normalized dot-product + vector search. (Jack Mazanec, Ben Trent) + Improvements --------------------- * GITHUB#12374: Add CachingLeafSlicesSupplier to compute the LeafSlices for concurrent segment search (Sorabh Hamirwasia) diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java b/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java index 8a515cb79fc..ae0633e8c0f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java +++ b/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java @@ -19,6 +19,7 @@ package org.apache.lucene.index; import static org.apache.lucene.util.VectorUtil.cosine; import static org.apache.lucene.util.VectorUtil.dotProduct; import static org.apache.lucene.util.VectorUtil.dotProductScore; +import static org.apache.lucene.util.VectorUtil.scaleMaxInnerProductScore; import static org.apache.lucene.util.VectorUtil.squareDistance; /** @@ -76,6 +77,23 @@ public enum VectorSimilarityFunction { public float compare(byte[] v1, byte[] v2) { return (1 + cosine(v1, v2)) / 2; } + }, + + /** + * Maximum inner product. 
This is like {@link VectorSimilarityFunction#DOT_PRODUCT}, but does not + * require normalization of the inputs. Should be used when the embedding vectors store useful + * information within the vector magnitude + */ + MAXIMUM_INNER_PRODUCT { + @Override + public float compare(float[] v1, float[] v2) { + return scaleMaxInnerProductScore(dotProduct(v1, v2)); + } + + @Override + public float compare(byte[] v1, byte[] v2) { + return scaleMaxInnerProductScore(dotProduct(v1, v2)); + } }; /** diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java index b8819082ba9..1af99245806 100644 --- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java @@ -164,6 +164,17 @@ public final class VectorUtil { return 0.5f + dotProduct(a, b) / denom; } + /** + * @param vectorDotProductSimilarity the raw similarity between two vectors + * @return A scaled score preventing negative scores for maximum-inner-product + */ + public static float scaleMaxInnerProductScore(float vectorDotProductSimilarity) { + if (vectorDotProductSimilarity < 0) { + return 1 / (1 + -1 * vectorDotProductSimilarity); + } + return vectorDotProductSimilarity + 1; + } + /** * Checks if a float vector only has finite components. 
* diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index d54db77c739..e2f47865051 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -346,6 +346,29 @@ abstract class BaseKnnVectorQueryTestCase extends LuceneTestCase { } } + public void testScoreMIP() throws IOException { + try (Directory indexStore = + getIndexStore( + "field", + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT, + new float[] {0, 1}, + new float[] {1, 2}, + new float[] {0, 0}); + IndexReader reader = DirectoryReader.open(indexStore)) { + IndexSearcher searcher = newSearcher(reader); + AbstractKnnVectorQuery kvq = getKnnVectorQuery("field", new float[] {0, -1}, 10); + assertMatches(searcher, kvq, 3); + ScoreDoc[] scoreDocs = searcher.search(kvq, 3).scoreDocs; + assertIdMatches(reader, "id2", scoreDocs[0]); + assertIdMatches(reader, "id0", scoreDocs[1]); + assertIdMatches(reader, "id1", scoreDocs[2]); + + assertEquals(1.0, scoreDocs[0].score, 1e-7); + assertEquals(1 / 2f, scoreDocs[1].score, 1e-7); + assertEquals(1 / 3f, scoreDocs[2].score, 1e-7); + } + } + public void testExplain() throws IOException { try (Directory d = newDirectory()) { try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) { @@ -739,11 +762,21 @@ abstract class BaseKnnVectorQueryTestCase extends LuceneTestCase { /** Creates a new directory and adds documents with the given vectors as kNN vector fields */ Directory getIndexStore(String field, float[]... 
contents) throws IOException { + return getIndexStore(field, VectorSimilarityFunction.EUCLIDEAN, contents); + } + + /** + * Creates a new directory and adds documents with the given vectors with similarity as kNN vector + * fields + */ + Directory getIndexStore( + String field, VectorSimilarityFunction vectorSimilarityFunction, float[]... contents) + throws IOException { Directory indexStore = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore); for (int i = 0; i < contents.length; ++i) { Document doc = new Document(); - doc.add(getKnnVectorField(field, contents[i])); + doc.add(getKnnVectorField(field, contents[i], vectorSimilarityFunction)); doc.add(new StringField("id", "id" + i, Field.Store.YES)); writer.addDocument(doc); } diff --git a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/DocumentsPanelProvider.java b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/DocumentsPanelProvider.java index 613cca415eb..56774640848 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/DocumentsPanelProvider.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/DocumentsPanelProvider.java @@ -1246,6 +1246,9 @@ public final class DocumentsPanelProvider implements DocumentsTabOperator { case EUCLIDEAN: sb.append("euc"); break; + case MAXIMUM_INNER_PRODUCT: + sb.append("mip"); + break; default: sb.append("???"); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 7f5dce9aa5e..4167e7a8a38 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -1278,7 +1278,8 @@ public abstract class BaseKnnVectorsFormatTestCase extends BaseIndexFileFormatTe 
assertEquals(0, VectorSimilarityFunction.EUCLIDEAN.ordinal()); assertEquals(1, VectorSimilarityFunction.DOT_PRODUCT.ordinal()); assertEquals(2, VectorSimilarityFunction.COSINE.ordinal()); - assertEquals(3, VectorSimilarityFunction.values().length); + assertEquals(3, VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT.ordinal()); + assertEquals(4, VectorSimilarityFunction.values().length); } public void testVectorEncodingOrdinals() {