Forbid empty doc values on vector functions (#43944)

Currently, when a document is missing a vector value, a vector function
returns 0 as the score for this document. We think this is incorrect
behaviour.
With this change, an error will be thrown if vector functions are
used with docs that are missing vector doc values.
Also, VectorScriptDocValues is modified to allow the size() function,
which can be used to check if a document has a value for the
vector field.
This commit is contained in:
Mayya Sharipova 2019-07-05 17:59:13 -04:00
parent a1a62fded3
commit 37e1ad7062
6 changed files with 68 additions and 14 deletions

View File

@ -28,3 +28,13 @@ TokenizerFactory now has a `name()` method that must be implemented. Most
plugin-provided TokenizerFactory implementations will extend `AbstractTokenizerFactory`,
which now takes a `name` parameter in its constructor.
[float]
[[breaking_74_search_changes]]
=== Search Changes
[float]
==== Forbid empty doc values in vector functions
If a document doesn't have a value for a vector field (dense_vector
or sparse_vector) on which a vector function is executed, an error will
be thrown.

View File

@ -182,8 +182,16 @@ between a given query vector and document vectors.
// NOTCONSOLE
NOTE: If a document doesn't have a value for a vector field on which
a vector function is executed, 0 is returned as a result
for this document.
a vector function is executed, an error will be thrown.
You can check whether a document is missing a value for the field `my_vector`
with `doc['my_vector'].size() == 0`. Your overall script can look like this:
[source,js]
--------------------------------------------------
"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])"
--------------------------------------------------
// NOTCONSOLE
NOTE: If a document's dense vector field has a number of dimensions
different from that of the query vector, an error will be thrown.

View File

@ -1,8 +1,8 @@
setup:
- skip:
features: headers
version: " - 7.2.99"
reason: "dense_vector dims parameter was added from 7.3"
version: " - 7.3.99"
reason: "dense_vector functions check on empty values was added from 7.4"
- do:
indices.create:
@ -131,7 +131,7 @@ setup:
- match: { error.root_cause.0.type: "script_exception" }
---
"Distance functions for documents missing vector field should return 0":
"Documents missing a vector field":
- do:
index:
index: test-index
@ -149,7 +149,9 @@ setup:
- do:
indices.refresh: {}
# expect an error when documents miss a vector field
- do:
catch: bad_request
headers:
Content-Type: application/json
search:
@ -162,6 +164,22 @@ setup:
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
params:
query_vector: [10.0, 10.0, 10.0]
- match: { error.root_cause.0.type: "script_exception" }
# guard against missing values by checking size()
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "doc['my_dense_vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
params:
query_vector: [10.0, 10.0, 10.0]
- match: {hits.total: 2}
- match: {hits.hits.0._id: "1"}

View File

@ -1,8 +1,8 @@
setup:
- skip:
features: headers
version: " - 7.2.99"
reason: "sparse_vector functions were introduced in 7.3.0"
version: " - 7.3.99"
reason: "sparse_vector functions check on empty values was added from 7.4"
- do:
indices.create:
@ -87,7 +87,7 @@ setup:
- match: {hits.hits.2._id: "3"}
---
"Distance functions for documents missing vector field should return 0":
"Documents missing a vector field":
- do:
index:
index: test-index
@ -105,7 +105,9 @@ setup:
- do:
indices.refresh: {}
# expect an error when documents miss a vector field
- do:
catch: bad_request
headers:
Content-Type: application/json
search:
@ -118,6 +120,22 @@ setup:
source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
params:
query_vector: {"1": 10.0}
- match: { error.root_cause.0.type: "script_exception" }
# guard against missing values by checking size()
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "doc['my_sparse_vector'].size() == 0 ? 0 : cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
params:
query_vector: {"1": 10.0}
- match: {hits.total: 2}
- match: {hits.hits.0._id: "1"}

View File

@ -28,7 +28,6 @@ public class ScoreScriptUtils {
*/
public static double dotProduct(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
if (queryVector.size() != docVector.length) {
throw new IllegalArgumentException("Can't calculate dotProduct! The number of dimensions of the query vector [" +
@ -63,7 +62,6 @@ public class ScoreScriptUtils {
public double cosineSimilarity(VectorScriptDocValues.DenseVectorScriptDocValues dvs) {
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
if (queryVector.size() != docVector.length) {
throw new IllegalArgumentException("Can't calculate cosineSimilarity! The number of dimensions of the query vector [" +
@ -129,7 +127,6 @@ public class ScoreScriptUtils {
public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
return intDotProductSparse(queryValues, queryDims, docValues, docDims);
@ -174,7 +171,6 @@ public class ScoreScriptUtils {
public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);

View File

@ -41,12 +41,16 @@ public abstract class VectorScriptDocValues extends ScriptDocValues<BytesRef> {
@Override
public BytesRef get(int index) {
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
throw new UnsupportedOperationException("accessing a vector field's value through 'get' or 'value' is not supported");
}
@Override
public int size() {
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
if (value == null) {
return 0;
} else {
return 1;
}
}
// not final, as it needs to be extended by Mockito for tests