Forbid empty doc values on vector functions (#43944)
Currently, when a document is missing a vector value, vector functions return 0 as the score for that document. We think this is incorrect behaviour. With this change, an error is thrown if vector functions are used on docs that are missing vector doc values. Also, VectorScriptDocValues is modified to allow the size() function, which can be used to check whether a document has a value for the vector field.
This commit is contained in:
parent
a1a62fded3
commit
37e1ad7062
|
@ -28,3 +28,13 @@ TokenizerFactory now has a `name()` method that must be implemented. Most
|
|||
plugin-provided TokenizerFactory implementations will extend `AbstractTokenizerFactory`,
|
||||
which now takes a `name` parameter in its constructor.
|
||||
|
||||
[float]
|
||||
[[breaking_74_search_changes]]
|
||||
=== Search Changes
|
||||
|
||||
[float]
|
||||
==== Forbid empty doc values in vector functions
|
||||
If a document doesn't have a value for a vector field (dense_vector
|
||||
or sparse_vector) on which a vector function is executed, an error will
|
||||
be thrown.
|
||||
|
||||
|
|
|
@ -182,8 +182,16 @@ between a given query vector and document vectors.
|
|||
// NOTCONSOLE
|
||||
|
||||
NOTE: If a document doesn't have a value for a vector field on which
|
||||
a vector function is executed, 0 is returned as a result
|
||||
for this document.
|
||||
a vector function is executed, an error will be thrown.
|
||||
|
||||
You can check if a document is missing a value for the field `my_vector` with
|
||||
`doc['my_vector'].size() == 0`. Your overall script can look like this:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])"
|
||||
--------------------------------------------------
|
||||
// NOTCONSOLE
|
||||
|
||||
NOTE: If a document's dense vector field has a number of dimensions
|
||||
different from the query's vector, an error will be thrown.
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
setup:
|
||||
- skip:
|
||||
features: headers
|
||||
version: " - 7.2.99"
|
||||
reason: "dense_vector dims parameter was added from 7.3"
|
||||
version: " - 7.3.99"
|
||||
reason: "dense_vector functions check on empty values was added from 7.4"
|
||||
|
||||
- do:
|
||||
indices.create:
|
||||
|
@ -131,7 +131,7 @@ setup:
|
|||
- match: { error.root_cause.0.type: "script_exception" }
|
||||
|
||||
---
|
||||
"Distance functions for documents missing vector field should return 0":
|
||||
"Documents missing a vector field":
|
||||
- do:
|
||||
index:
|
||||
index: test-index
|
||||
|
@ -149,7 +149,9 @@ setup:
|
|||
- do:
|
||||
indices.refresh: {}
|
||||
|
||||
# expect an error when documents miss a vector field
|
||||
- do:
|
||||
catch: bad_request
|
||||
headers:
|
||||
Content-Type: application/json
|
||||
search:
|
||||
|
@ -162,6 +164,22 @@ setup:
|
|||
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
|
||||
params:
|
||||
query_vector: [10.0, 10.0, 10.0]
|
||||
- match: { error.root_cause.0.type: "script_exception" }
|
||||
|
||||
# guard against missing values by checking size()
|
||||
- do:
|
||||
headers:
|
||||
Content-Type: application/json
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "doc['my_dense_vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
|
||||
params:
|
||||
query_vector: [10.0, 10.0, 10.0]
|
||||
|
||||
- match: {hits.total: 2}
|
||||
- match: {hits.hits.0._id: "1"}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
setup:
|
||||
- skip:
|
||||
features: headers
|
||||
version: " - 7.2.99"
|
||||
reason: "sparse_vector functions were introduced in 7.3.0"
|
||||
version: " - 7.3.99"
|
||||
reason: "sparse_vector functions check on empty values was added from 7.4"
|
||||
|
||||
- do:
|
||||
indices.create:
|
||||
|
@ -87,7 +87,7 @@ setup:
|
|||
- match: {hits.hits.2._id: "3"}
|
||||
|
||||
---
|
||||
"Distance functions for documents missing vector field should return 0":
|
||||
"Documents missing a vector field":
|
||||
- do:
|
||||
index:
|
||||
index: test-index
|
||||
|
@ -105,7 +105,9 @@ setup:
|
|||
- do:
|
||||
indices.refresh: {}
|
||||
|
||||
# expect an error when documents miss a vector field
|
||||
- do:
|
||||
catch: bad_request
|
||||
headers:
|
||||
Content-Type: application/json
|
||||
search:
|
||||
|
@ -118,6 +120,22 @@ setup:
|
|||
source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
|
||||
params:
|
||||
query_vector: {"1": 10.0}
|
||||
- match: { error.root_cause.0.type: "script_exception" }
|
||||
|
||||
# guard against missing values by checking size()
|
||||
- do:
|
||||
headers:
|
||||
Content-Type: application/json
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "doc['my_sparse_vector'].size() == 0 ? 0 : cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
|
||||
params:
|
||||
query_vector: {"1": 10.0}
|
||||
|
||||
- match: {hits.total: 2}
|
||||
- match: {hits.hits.0._id: "1"}
|
||||
|
|
|
@ -28,7 +28,6 @@ public class ScoreScriptUtils {
|
|||
*/
|
||||
public static double dotProduct(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
|
||||
BytesRef value = dvs.getEncodedValue();
|
||||
if (value == null) return 0;
|
||||
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
|
||||
if (queryVector.size() != docVector.length) {
|
||||
throw new IllegalArgumentException("Can't calculate dotProduct! The number of dimensions of the query vector [" +
|
||||
|
@ -63,7 +62,6 @@ public class ScoreScriptUtils {
|
|||
|
||||
public double cosineSimilarity(VectorScriptDocValues.DenseVectorScriptDocValues dvs) {
|
||||
BytesRef value = dvs.getEncodedValue();
|
||||
if (value == null) return 0;
|
||||
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
|
||||
if (queryVector.size() != docVector.length) {
|
||||
throw new IllegalArgumentException("Can't calculate cosineSimilarity! The number of dimensions of the query vector [" +
|
||||
|
@ -129,7 +127,6 @@ public class ScoreScriptUtils {
|
|||
|
||||
public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
|
||||
BytesRef value = dvs.getEncodedValue();
|
||||
if (value == null) return 0;
|
||||
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
|
||||
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
|
||||
return intDotProductSparse(queryValues, queryDims, docValues, docDims);
|
||||
|
@ -174,7 +171,6 @@ public class ScoreScriptUtils {
|
|||
|
||||
public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
|
||||
BytesRef value = dvs.getEncodedValue();
|
||||
if (value == null) return 0;
|
||||
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
|
||||
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
|
||||
|
||||
|
|
|
@ -41,12 +41,16 @@ public abstract class VectorScriptDocValues extends ScriptDocValues<BytesRef> {
|
|||
|
||||
@Override
|
||||
public BytesRef get(int index) {
|
||||
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
|
||||
throw new UnsupportedOperationException("accessing a vector field's value through 'get' or 'value' is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
|
||||
if (value == null) {
|
||||
return 0;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// not final, as it needs to be extended by Mockito for tests
|
||||
|
|
Loading…
Reference in New Issue