mirror of https://github.com/honeymoose/OpenSearch.git
synced 2025-03-25 09:28:27 +00:00

Add l1norm and l2norm distances for vectors (#44116)

Add `l1norm` (Manhattan distance) and `l2norm` (Euclidean distance) functions for vectors. Relates to #37947.

This commit is contained in:
parent 31725ef390
commit 32cb47b91c
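
For orientation (this note is editorial, not part of the commit): the two new functions reduce to the following arithmetic for dense vectors. A minimal Java sketch, assuming plain double arrays purely for illustration; the actual implementation operates on encoded doc values, as shown in the `ScoreScriptUtils` diff below.

[source,java]
--------------------------------------------------
// Minimal sketch of the two distances this commit adds, using plain double arrays.
public final class VectorDistances {

    // L1 norm (Manhattan distance): sum of absolute per-dimension differences.
    public static double l1norm(double[] query, double[] doc) {
        double sum = 0;
        for (int i = 0; i < query.length; i++) {
            sum += Math.abs(query[i] - doc[i]);
        }
        return sum;
    }

    // L2 norm (Euclidean distance): square root of the sum of squared differences.
    public static double l2norm(double[] query, double[] doc) {
        double sum = 0;
        for (int i = 0; i < query.length; i++) {
            double diff = query[i] - doc[i];
            sum += diff * diff;
        }
        return Math.sqrt(sum);
    }
}
--------------------------------------------------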
@ -11,7 +11,6 @@ a function to be used to compute a new score for each document returned
by the query. For more information on scripting see
<<modules-scripting, scripting documentation>>.

Here is an example of using `script_score` to assign each matched document
a score equal to the number of likes divided by 10:

@ -32,7 +31,6 @@ GET /_search
}
--------------------------------------------------
// CONSOLE
// TEST[setup:twitter]

==== Accessing the score of a document within a script

@ -72,131 +70,6 @@ to be the most efficient by using the internal mechanisms.
--------------------------------------------------
// NOTCONSOLE

[role="xpack"]
[testenv="basic"]
[[vector-functions]]
===== Functions for vector fields

experimental[]

These functions are used for
for <<dense-vector,`dense_vector`>> and
<<sparse-vector,`sparse_vector`>> fields.

NOTE: During vector functions' calculation, all matched documents are
linearly scanned. Thus, expect the query time grow linearly
with the number of matched documents. For this reason, we recommend
to limit the number of matched documents with a `query` parameter.

For dense_vector fields, `cosineSimilarity` calculates the measure of
cosine similarity between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.queryVector, doc['my_dense_vector'])",
        "params": {
          "queryVector": [4, 3.4, -0.2]  <1>
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE
<1> To take advantage of the script optimizations, provide a query vector as a script parameter.

Similarly, for sparse_vector fields, `cosineSimilaritySparse` calculates cosine similarity
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilaritySparse(params.queryVector, doc['my_sparse_vector'])",
        "params": {
          "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE

For dense_vector fields, `dotProduct` calculates the measure of
dot product between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "dotProduct(params.queryVector, doc['my_dense_vector'])",
        "params": {
          "queryVector": [4, 3.4, -0.2]
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE

Similarly, for sparse_vector fields, `dotProductSparse` calculates dot product
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "dotProductSparse(params.queryVector, doc['my_sparse_vector'])",
        "params": {
          "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
        }
      }
    }
  }
}
--------------------------------------------------
// NOTCONSOLE

NOTE: If a document doesn't have a value for a vector field on which
a vector function is executed, an error will be thrown.

You can check if a document has a value for the field `my_vector` by
`doc['my_vector'].size() == 0`. Your overall script can look like this:

[source,js]
--------------------------------------------------
"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])"
--------------------------------------------------
// NOTCONSOLE

NOTE: If a document's dense vector field has a number of dimensions
different from the query's vector, an error will be thrown.


[[random-score-function]]
===== Random score function
`random_score` function generates scores that are uniformly distributed
@ -310,6 +183,9 @@ You can read more about decay functions
NOTE: Decay functions on dates are limited to dates in the default format
and default time zone. Also calculations with `now` are not supported.

===== Functions for vector fields
<<vector-functions, Functions for vector fields>> are accessible through
`script_score` query.

==== Faster alternatives
Script Score Query calculates the score for every hit (matching document).
@ -409,5 +285,4 @@ through a script:
Script Score query has equivalent <<decay-functions, decay functions>>
that can be used in script.


include::{es-repo-dir}/vectors/vector-functions.asciidoc[]

docs/reference/vectors/vector-functions.asciidoc (new file, 279 lines)
@ -0,0 +1,279 @@
[role="xpack"]
[testenv="basic"]
[[vector-functions]]
===== Functions for vector fields

experimental[]

These functions are used for
<<dense-vector,`dense_vector`>> and
<<sparse-vector,`sparse_vector`>> fields.

NOTE: During vector functions' calculation, all matched documents are
linearly scanned. Thus, expect the query time to grow linearly
with the number of matched documents. For this reason, we recommend
limiting the number of matched documents with a `query` parameter.

Let's create an index with the following mapping and index a couple
of documents into it.

[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "properties": {
      "my_dense_vector": {
        "type": "dense_vector",
        "dims": 3
      },
      "my_sparse_vector" : {
        "type" : "sparse_vector"
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_dense_vector": [0.5, 10, 6],
  "my_sparse_vector": {"2": 1.5, "15" : 2, "50": -1.1, "4545": 1.1}
}

PUT my_index/_doc/2
{
  "my_dense_vector": [-0.5, 10, 10],
  "my_sparse_vector": {"2": 2.5, "10" : 1.3, "55": -2.3, "113": 1.6}
}

--------------------------------------------------
// CONSOLE
// TESTSETUP

For dense_vector fields, `cosineSimilarity` calculates the measure of
cosine similarity between a given query vector and document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.query_vector, doc['my_dense_vector']) + 1.0", <1>
        "params": {
          "query_vector": [4, 3.4, -0.2]  <2>
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE
<1> The script adds 1.0 to the cosine similarity to prevent the score from being negative.
<2> To take advantage of the script optimizations, provide a query vector as a script parameter.

NOTE: If a document's dense vector field has a number of dimensions
different from the query's vector, an error will be thrown.

Similarly, for sparse_vector fields, `cosineSimilaritySparse` calculates cosine similarity
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector']) + 1.0",
        "params": {
          "query_vector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

For dense_vector fields, `dotProduct` calculates the measure of
dot product between a given query vector and document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": """
          double value = dotProduct(params.query_vector, doc['my_dense_vector']);
          return sigmoid(1, Math.E, -value); <1>
        """,
        "params": {
          "query_vector": [4, 3.4, -0.2]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

<1> Using the standard sigmoid function prevents scores from being negative.

Similarly, for sparse_vector fields, `dotProductSparse` calculates dot product
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": """
          double value = dotProductSparse(params.query_vector, doc['my_sparse_vector']);
          return sigmoid(1, Math.E, -value);
        """,
        "params": {
          "query_vector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

For dense_vector fields, `l1norm` calculates L^1^ distance
(Manhattan distance) between a given query vector and
document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "1 / (1 + l1norm(params.queryVector, doc['my_dense_vector']))", <1>
        "params": {
          "queryVector": [4, 3.4, -0.2]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

<1> Unlike `cosineSimilarity`, which represents similarity, `l1norm` and
`l2norm` shown below represent distances or differences. This means that
the more similar the vectors are, the lower the scores produced by the
`l1norm` and `l2norm` functions will be.
Thus, as we need more similar vectors to score higher,
we reversed the output of `l1norm` and `l2norm`. Also, to avoid
division by 0 when a document vector matches the query exactly,
we added `1` to the denominator.
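
To make the inversion concrete, here is a minimal sketch of the arithmetic performed by the script above for document `1` (`my_dense_vector: [0.5, 10, 6]`), written in plain Java rather than Painless purely for illustration:

[source,java]
--------------------------------------------------
// Illustration of the "1 / (1 + l1norm)" scoring used above, on document 1 from the setup.
double[] queryVector = {4, 3.4, -0.2};
double[] docVector = {0.5, 10, 6};                   // my_dense_vector of document 1

double l1 = 0;
for (int i = 0; i < queryVector.length; i++) {
    l1 += Math.abs(queryVector[i] - docVector[i]);    // 3.5 + 6.6 + 6.2 = 16.3
}
double score = 1 / (1 + l1);                          // 1 / 17.3, roughly 0.0578
System.out.println(score);
--------------------------------------------------

Identical vectors give a distance of `0` and therefore the maximum score of `1.0`.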

For sparse_vector fields, `l1normSparse` calculates L^1^ distance
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "1 / (1 + l1normSparse(params.queryVector, doc['my_sparse_vector']))",
        "params": {
          "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE
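
For sparse vectors, a dimension present on only one side contributes its absolute value to the distance, i.e. the missing side is treated as `0` (see the `// 0 for missing ... dim` comments in the `ScoreScriptUtils` changes below). A minimal Java sketch of that rule, assuming plain dimension-to-value maps purely for illustration; the real implementation works on encoded doc values:

[source,java]
--------------------------------------------------
import java.util.HashMap;
import java.util.Map;

public final class SparseL1Sketch {
    // L1 distance between two sparse vectors given as dimension -> value maps;
    // a dimension missing from one side is treated as 0 on that side.
    static double l1normSparse(Map<Integer, Double> query, Map<Integer, Double> doc) {
        double sum = 0;
        for (Map.Entry<Integer, Double> q : query.entrySet()) {
            sum += Math.abs(q.getValue() - doc.getOrDefault(q.getKey(), 0.0));
        }
        for (Map.Entry<Integer, Double> d : doc.entrySet()) {
            if (query.containsKey(d.getKey()) == false) {
                sum += Math.abs(d.getValue());        // dimension only present in the document
            }
        }
        return sum;
    }

    public static void main(String[] args) {
        Map<Integer, Double> query = new HashMap<>();
        query.put(2, 0.5); query.put(10, 111.3); query.put(50, -1.3); query.put(113, 14.8); query.put(4545, 156.0);
        Map<Integer, Double> doc1 = new HashMap<>();  // my_sparse_vector of document 1
        doc1.put(2, 1.5); doc1.put(15, 2.0); doc1.put(50, -1.1); doc1.put(4545, 1.1);
        System.out.println(l1normSparse(query, doc1)); // dims 10, 113 and 15 count in full: 284.2
    }
}
--------------------------------------------------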

For dense_vector fields, `l2norm` calculates L^2^ distance
(Euclidean distance) between a given query vector and
document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "1 / (1 + l2norm(params.queryVector, doc['my_dense_vector']))",
        "params": {
          "queryVector": [4, 3.4, -0.2]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE
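
For comparison, a minimal Java sketch (illustration only, assuming plain double arrays) of what the `1 / (1 + l2norm(...))` script above computes for document `1`:

[source,java]
--------------------------------------------------
// Illustration of the "1 / (1 + l2norm)" scoring used above, on document 1 from the setup.
double[] queryVector = {4, 3.4, -0.2};
double[] docVector = {0.5, 10, 6};                    // my_dense_vector of document 1

double sumOfSquares = 0;
for (int i = 0; i < queryVector.length; i++) {
    double diff = queryVector[i] - docVector[i];
    sumOfSquares += diff * diff;                       // 12.25 + 43.56 + 38.44 = 94.25
}
double l2 = Math.sqrt(sumOfSquares);                   // roughly 9.708
double score = 1 / (1 + l2);                           // roughly 0.0934
System.out.println(score);
--------------------------------------------------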

Similarly, for sparse_vector fields, `l2normSparse` calculates L^2^ distance
between a given query vector and document vectors.

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "1 / (1 + l2normSparse(params.queryVector, doc['my_sparse_vector']))",
        "params": {
          "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

NOTE: If a document doesn't have a value for a vector field on which
a vector function is executed, an error will be thrown.

You can check if a document has a value for the field `my_vector` by
`doc['my_vector'].size() == 0`. Your overall script can look like this:

[source,js]
--------------------------------------------------
"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])"
--------------------------------------------------
// NOTCONSOLE

@ -0,0 +1,102 @@
setup:
  - skip:
      features: headers
      version: " - 7.3.99"
      reason: "l1norm and l2norm functions were added from 7.4"

  - do:
      indices.create:
        include_type_name: false
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_dense_vector:
                type: dense_vector
                dims: 5
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_dense_vector: [230.0, 300.33, -34.8988, 15.555, -200.0]

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_dense_vector: [-0.5, 100.0, -13, 14.8, -156.0]

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_dense_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

  - do:
      indices.refresh: {}

---
"L1 norm":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "l1norm(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 485.18}
  - lte: {hits.hits.0._score: 485.19}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 12.29}
  - lte: {hits.hits.1._score: 12.30}

  - match: {hits.hits.2._id: "3"}
  - gte: {hits.hits.2._score: 0.00}
  - lte: {hits.hits.2._score: 0.01}

---
"L2 norm":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "l2norm(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 301.36}
  - lte: {hits.hits.0._score: 301.37}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 11.34}
  - lte: {hits.hits.1._score: 11.35}

  - match: {hits.hits.2._id: "3"}
  - gte: {hits.hits.2._score: 0.00}
  - lte: {hits.hits.2._score: 0.01}
@ -0,0 +1,101 @@
setup:
  - skip:
      features: headers
      version: " - 7.3.99"
      reason: "l1norm and l2norm functions were added from 7.4"

  - do:
      indices.create:
        include_type_name: false
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_sparse_vector:
                type: sparse_vector
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"2": 230.0, "10" : 300.33, "50": -34.8988, "113": 15.555, "4545": -200.0}

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_sparse_vector: {"2": -0.5, "10" : 100.0, "50": -13, "113": 14.8, "4545": -156.0}

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_sparse_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}

  - do:
      indices.refresh: {}

---
"L1 norm":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "l1normSparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 485.18}
  - lte: {hits.hits.0._score: 485.19}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 12.29}
  - lte: {hits.hits.1._score: 12.30}

  - match: {hits.hits.2._id: "3"}
  - gte: {hits.hits.2._score: 0.00}
  - lte: {hits.hits.2._score: 0.01}


---
"L2 norm":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "l2normSparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 301.36}
  - lte: {hits.hits.0._score: 301.37}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 11.34}
  - lte: {hits.hits.1._score: 11.35}

  - match: {hits.hits.2._id: "3"}
  - gte: {hits.hits.2._score: 0.00}
  - lte: {hits.hits.2._score: 0.01}
@ -219,3 +219,139 @@ setup:
                params:
                  query_vector: [0.5, 111]
  - match: { error.root_cause.0.type: "script_exception" }

---
"Query vector has different dimensions from documents' vectors":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"1": 10}

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_sparse_vector: {"1": 10, "10" : 10.5}

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_sparse_vector: {"1": 10, "10" : 10.5, "100": 100.5}

  - do:
      indices.refresh: {}

  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "dotProductSparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10, "5": 5}

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 99.99}
  - lte: {hits.hits.0._score: 100.01}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.0._score: 99.99}
  - lte: {hits.hits.0._score: 100.01}

  - match: {hits.hits.2._id: "3"}
  - gte: {hits.hits.0._score: 99.99}
  - lte: {hits.hits.0._score: 100.01}


  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10, "5" : 5}

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 0.894}
  - lte: {hits.hits.0._score: 0.895}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 0.61}
  - lte: {hits.hits.1._score: 0.62}

  - match: {hits.hits.2._id: "3"}
  - gte: {hits.hits.2._score: 0.08}
  - lte: {hits.hits.2._score: 0.09}

  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "l1normSparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10, "5": 5}

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "3"}
  - match: {hits.hits.0._score: 116}

  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.1._score: 15.5}

  - match: {hits.hits.2._id: "1"}
  - match: {hits.hits.2._score: 5}

  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "l2normSparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10, "5": 5}

  - match: {hits.total: 3}

  - match: {hits.hits.0._id: "3"}
  - gte: {hits.hits.0._score: 101.17}
  - lte: {hits.hits.0._score: 101.18}

  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 11.62}
  - lte: {hits.hits.1._score: 11.63}

  - match: {hits.hits.2._id: "1"}
  - gte: {hits.hits.2._score: 5.0}
  - lte: {hits.hits.2._score: 5.0}
@ -20,6 +20,52 @@ public class ScoreScriptUtils {

    //**************FUNCTIONS FOR DENSE VECTORS

    /**
     * Calculate l1 norm - Manhattan distance
     * between a query's dense vector and documents' dense vectors
     *
     * @param queryVector the query vector parsed as {@code List<Number>} from json
     * @param dvs VectorScriptDocValues representing encoded documents' vectors
     */
    public static double l1norm(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
        BytesRef value = dvs.getEncodedValue();
        float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
        if (queryVector.size() != docVector.length) {
            throw new IllegalArgumentException("Can't calculate l1norm! The number of dimensions of the query vector [" +
                queryVector.size() + "] is different from the documents' vectors [" + docVector.length + "].");
        }
        Iterator<Number> queryVectorIter = queryVector.iterator();
        double l1norm = 0;
        for (int dim = 0; dim < docVector.length; dim++){
            l1norm += Math.abs(queryVectorIter.next().doubleValue() - docVector[dim]);
        }
        return l1norm;
    }

    /**
     * Calculate l2 norm - Euclidean distance
     * between a query's dense vector and documents' dense vectors
     *
     * @param queryVector the query vector parsed as {@code List<Number>} from json
     * @param dvs VectorScriptDocValues representing encoded documents' vectors
     */
    public static double l2norm(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
        BytesRef value = dvs.getEncodedValue();
        float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
        if (queryVector.size() != docVector.length) {
            throw new IllegalArgumentException("Can't calculate l2norm! The number of dimensions of the query vector [" +
                queryVector.size() + "] is different from the documents' vectors [" + docVector.length + "].");
        }
        Iterator<Number> queryVectorIter = queryVector.iterator();
        double l2norm = 0;
        for (int dim = 0; dim < docVector.length; dim++){
            double diff = queryVectorIter.next().doubleValue() - docVector[dim];
            l2norm += diff * diff;
        }
        return Math.sqrt(l2norm);
    }


    /**
     * Calculate a dot product between a query's dense vector and documents' dense vectors
     *
@ -92,25 +138,17 @@ public class ScoreScriptUtils {

    //**************FUNCTIONS FOR SPARSE VECTORS

    /**
     * Calculate a dot product between a query's sparse vector and documents' sparse vectors
     *
     * DotProductSparse is implemented as a class to use
     * painless script caching to prepare queryVector
     * only once per script execution for all documents.
     * A user will call `dotProductSparse(params.queryVector, doc['my_vector'])`
     */
    public static final class DotProductSparse {
    public static class VectorSparseFunctions {
        final double[] queryValues;
        final int[] queryDims;

        // prepare queryVector once per script execution
        // queryVector represents a map of dimensions to values
        public DotProductSparse(Map<String, Number> queryVector) {
        public VectorSparseFunctions(Map<String, Number> queryVector) {
            //break vector into two arrays dims and values
            int n = queryVector.size();
            queryDims = new int[n];
            queryValues = new double[n];
            queryDims = new int[n];
            int i = 0;
            for (Map.Entry<String, Number> dimValue : queryVector.entrySet()) {
                try {
@ -124,6 +162,115 @@ public class ScoreScriptUtils {
            // Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions
            sortSparseDimsDoubleValues(queryDims, queryValues, n);
        }
    }

    /**
     * Calculate l1 norm - Manhattan distance
     * between a query's sparse vector and documents' sparse vectors
     *
     * L1NormSparse is implemented as a class to use
     * painless script caching to prepare queryVector
     * only once per script execution for all documents.
     * A user will call `l1normSparse(params.queryVector, doc['my_vector'])`
     */
    public static final class L1NormSparse extends VectorSparseFunctions {
        public L1NormSparse(Map<String, Number> queryVector) {
            super(queryVector);
        }

        public double l1normSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
            BytesRef value = dvs.getEncodedValue();
            int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
            float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
            int queryIndex = 0;
            int docIndex = 0;
            double l1norm = 0;
            while (queryIndex < queryDims.length && docIndex < docDims.length) {
                if (queryDims[queryIndex] == docDims[docIndex]) {
                    l1norm += Math.abs(queryValues[queryIndex] - docValues[docIndex]);
                    queryIndex++;
                    docIndex++;
                } else if (queryDims[queryIndex] > docDims[docIndex]) {
                    l1norm += Math.abs(docValues[docIndex]); // 0 for missing query dim
                    docIndex++;
                } else {
                    l1norm += Math.abs(queryValues[queryIndex]); // 0 for missing doc dim
                    queryIndex++;
                }
            }
            while (queryIndex < queryDims.length) {
                l1norm += Math.abs(queryValues[queryIndex]); // 0 for missing doc dim
                queryIndex++;
            }
            while (docIndex < docDims.length) {
                l1norm += Math.abs(docValues[docIndex]); // 0 for missing query dim
                docIndex++;
            }
            return l1norm;
        }
    }

    /**
     * Calculate l2 norm - Euclidean distance
     * between a query's sparse vector and documents' sparse vectors
     *
     * L2NormSparse is implemented as a class to use
     * painless script caching to prepare queryVector
     * only once per script execution for all documents.
     * A user will call `l2normSparse(params.queryVector, doc['my_vector'])`
     */
    public static final class L2NormSparse extends VectorSparseFunctions {
        public L2NormSparse(Map<String, Number> queryVector) {
            super(queryVector);
        }

        public double l2normSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
            BytesRef value = dvs.getEncodedValue();
            int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
            float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
            int queryIndex = 0;
            int docIndex = 0;
            double l2norm = 0;
            while (queryIndex < queryDims.length && docIndex < docDims.length) {
                if (queryDims[queryIndex] == docDims[docIndex]) {
                    double diff = queryValues[queryIndex] - docValues[docIndex];
                    l2norm += diff * diff;
                    queryIndex++;
                    docIndex++;
                } else if (queryDims[queryIndex] > docDims[docIndex]) {
                    double diff = docValues[docIndex]; // 0 for missing query dim
                    l2norm += diff * diff;
                    docIndex++;
                } else {
                    double diff = queryValues[queryIndex]; // 0 for missing doc dim
                    l2norm += diff * diff;
                    queryIndex++;
                }
            }
            while (queryIndex < queryDims.length) {
                l2norm += queryValues[queryIndex] * queryValues[queryIndex]; // 0 for missing doc dims
                queryIndex++;
            }
            while (docIndex < docDims.length) {
                l2norm += docValues[docIndex] * docValues[docIndex]; // 0 for missing query dims
                docIndex++;
            }
            return Math.sqrt(l2norm);
        }
    }

    /**
     * Calculate a dot product between a query's sparse vector and documents' sparse vectors
     *
     * DotProductSparse is implemented as a class to use
     * painless script caching to prepare queryVector
     * only once per script execution for all documents.
     * A user will call `dotProductSparse(params.queryVector, doc['my_vector'])`
     */
    public static final class DotProductSparse extends VectorSparseFunctions {
        public DotProductSparse(Map<String, Number> queryVector) {
            super(queryVector);
        }

        public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
            BytesRef value = dvs.getEncodedValue();
@ -141,32 +288,16 @@ public class ScoreScriptUtils {
     * only once per script execution for all documents.
     * A user will call `cosineSimilaritySparse(params.queryVector, doc['my_vector'])`
     */
    public static final class CosineSimilaritySparse {
        final double[] queryValues;
        final int[] queryDims;
    public static final class CosineSimilaritySparse extends VectorSparseFunctions {
        final double queryVectorMagnitude;

        // prepare queryVector once per script execution
        public CosineSimilaritySparse(Map<String, Number> queryVector) {
            //break vector into two arrays dims and values
            int n = queryVector.size();
            queryValues = new double[n];
            queryDims = new int[n];
            super(queryVector);
            double dotProduct = 0;
            int i = 0;
            for (Map.Entry<String, Number> dimValue : queryVector.entrySet()) {
                try {
                    queryDims[i] = Integer.parseInt(dimValue.getKey());
                } catch (final NumberFormatException e) {
                    throw new IllegalArgumentException("Failed to parse a query vector dimension, it must be an integer!", e);
                }
                queryValues[i] = dimValue.getValue().doubleValue();
            for (int i = 0; i < queryDims.length; i++) {
                dotProduct += queryValues[i] * queryValues[i];
                i++;
            }
            this.queryVectorMagnitude = Math.sqrt(dotProduct);
            // Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions
            sortSparseDimsDoubleValues(queryDims, queryValues, n);
        }

        public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {

@ -11,8 +11,12 @@ class org.elasticsearch.xpack.vectors.query.VectorScriptDocValues$SparseVectorSc
}

static_import {
    double l1norm(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.xpack.vectors.query.ScoreScriptUtils
    double l2norm(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.xpack.vectors.query.ScoreScriptUtils
    double cosineSimilarity(List, VectorScriptDocValues.DenseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$CosineSimilarity
    double dotProduct(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.xpack.vectors.query.ScoreScriptUtils
    double l1normSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$L1NormSparse
    double l2normSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$L2NormSparse
    double dotProductSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$DotProductSparse
    double cosineSimilaritySparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.xpack.vectors.query.ScoreScriptUtils$CosineSimilaritySparse
}
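
The two binding styles above mirror the two shapes in `ScoreScriptUtils`: `from_class` whitelists a plain static method, while `bound_to` binds the call to a class whose constructor receives the query vector once per script execution so it is not re-parsed for every document. A rough, self-contained Java sketch of that difference; the names `staticDistance` and `BoundDistance` are made up for illustration and are not part of the codebase:

[source,java]
--------------------------------------------------
import java.util.List;

public final class BindingStyles {

    // "from_class" style: a plain static helper, fully re-evaluated for every document.
    public static double staticDistance(List<Double> queryVector, List<Double> docVector) {
        double sum = 0;
        for (int i = 0; i < queryVector.size(); i++) {
            sum += Math.abs(queryVector.get(i) - docVector.get(i));
        }
        return sum;
    }

    // "bound_to" style: the constructor receives the constant argument once per script execution,
    // so per-query preparation is not repeated for every document.
    public static final class BoundDistance {
        private final List<Double> queryVector;

        public BoundDistance(List<Double> queryVector) {
            this.queryVector = queryVector;   // prepared once
        }

        public double distance(List<Double> docVector) {
            return staticDistance(queryVector, docVector);   // per-document work only
        }
    }
}
--------------------------------------------------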
@ -12,6 +12,8 @@ import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.CosineSimilarity;
import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.DotProductSparse;
import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.CosineSimilaritySparse;
import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.L1NormSparse;
import org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.L2NormSparse;

import java.util.Arrays;
import java.util.HashMap;
@ -20,6 +22,9 @@ import java.util.Map;

import static org.elasticsearch.xpack.vectors.mapper.VectorEncoderDecoderTests.mockEncodeDenseVector;
import static org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.dotProduct;
import static org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.l1norm;
import static org.elasticsearch.xpack.vectors.query.ScoreScriptUtils.l2norm;

import static org.hamcrest.Matchers.containsString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@ -35,12 +40,20 @@ public class ScoreScriptUtilsTests extends ESTestCase {

        // test dotProduct
        double result = dotProduct(queryVector, dvs);
        assertEquals("dotProduct result is not equal to the expected value!", 65425.62, result, 0.1);
        assertEquals("dotProduct result is not equal to the expected value!", 65425.626, result, 0.001);

        // test cosineSimilarity
        CosineSimilarity cosineSimilarity = new CosineSimilarity(queryVector);
        double result2 = cosineSimilarity.cosineSimilarity(dvs);
        assertEquals("cosineSimilarity result is not equal to the expected value!", 0.78, result2, 0.1);
        assertEquals("cosineSimilarity result is not equal to the expected value!", 0.790, result2, 0.001);

        // test l1norm
        double result3 = l1norm(queryVector, dvs);
        assertEquals("l1norm result is not equal to the expected value!", 485.184, result3, 0.001);

        // test l2norm
        double result4 = l2norm(queryVector, dvs);
        assertEquals("l2norm result is not equal to the expected value!", 301.361, result4, 0.001);

        // test dotProduct fails when queryVector has wrong number of dims
        List<Number> invalidQueryVector = Arrays.asList(0.5, 111.3);
@ -52,6 +65,13 @@ public class ScoreScriptUtilsTests extends ESTestCase {
        e = expectThrows(IllegalArgumentException.class, () -> cosineSimilarity2.cosineSimilarity(dvs));
        assertThat(e.getMessage(), containsString("dimensions of the query vector [2] is different from the documents' vectors [5]"));

        // test l1norm fails when queryVector has wrong number of dims
        e = expectThrows(IllegalArgumentException.class, () -> l1norm(invalidQueryVector, dvs));
        assertThat(e.getMessage(), containsString("dimensions of the query vector [2] is different from the documents' vectors [5]"));

        // test l2norm fails when queryVector has wrong number of dims
        e = expectThrows(IllegalArgumentException.class, () -> l2norm(invalidQueryVector, dvs));
        assertThat(e.getMessage(), containsString("dimensions of the query vector [2] is different from the documents' vectors [5]"));
    }

    public void testSparseVectorFunctions() {
@ -71,11 +91,95 @@ public class ScoreScriptUtilsTests extends ESTestCase {
        // test dotProduct
        DotProductSparse docProductSparse = new DotProductSparse(queryVector);
        double result = docProductSparse.dotProductSparse(dvs);
        assertEquals("dotProductSparse result is not equal to the expected value!", 65425.62, result, 0.1);
        assertEquals("dotProductSparse result is not equal to the expected value!", 65425.626, result, 0.001);

        // test cosineSimilarity
        CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector);
        double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs);
        assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.78, result2, 0.1);
        assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.790, result2, 0.001);

        // test l1norm
        L1NormSparse l1Norm = new L1NormSparse(queryVector);
        double result3 = l1Norm.l1normSparse(dvs);
        assertEquals("l1normSparse result is not equal to the expected value!", 485.184, result3, 0.001);

        // test l2norm
        L2NormSparse l2Norm = new L2NormSparse(queryVector);
        double result4 = l2Norm.l2normSparse(dvs);
        assertEquals("l2normSparse result is not equal to the expected value!", 301.361, result4, 0.001);
    }

    public void testSparseVectorMissingDimensions1() {
        // Document vector's biggest dimension > query vector's biggest dimension
        int[] docVectorDims = {2, 10, 50, 113, 4545, 4546};
        float[] docVectorValues = {230.0f, 300.33f, -34.8988f, 15.555f, -200.0f, 11.5f};
        BytesRef encodedDocVector = VectorEncoderDecoder.encodeSparseVector(docVectorDims, docVectorValues, docVectorDims.length);
        VectorScriptDocValues.SparseVectorScriptDocValues dvs = mock(VectorScriptDocValues.SparseVectorScriptDocValues.class);
        when(dvs.getEncodedValue()).thenReturn(encodedDocVector);
        Map<String, Number> queryVector = new HashMap<String, Number>() {{
            put("2", 0.5);
            put("10", 111.3);
            put("50", -13.0);
            put("113", 14.8);
            put("114", -20.5);
            put("4545", -156.0);
        }};

        // test dotProduct
        DotProductSparse docProductSparse = new DotProductSparse(queryVector);
        double result = docProductSparse.dotProductSparse(dvs);
        assertEquals("dotProductSparse result is not equal to the expected value!", 65425.626, result, 0.001);

        // test cosineSimilarity
        CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector);
        double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs);
        assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.786, result2, 0.001);

        // test l1norm
        L1NormSparse l1Norm = new L1NormSparse(queryVector);
        double result3 = l1Norm.l1normSparse(dvs);
        assertEquals("l1normSparse result is not equal to the expected value!", 517.184, result3, 0.001);

        // test l2norm
        L2NormSparse l2Norm = new L2NormSparse(queryVector);
        double result4 = l2Norm.l2normSparse(dvs);
        assertEquals("l2normSparse result is not equal to the expected value!", 302.277, result4, 0.001);
    }

    public void testSparseVectorMissingDimensions2() {
        // Document vector's biggest dimension < query vector's biggest dimension
        int[] docVectorDims = {2, 10, 50, 113, 4545, 4546};
        float[] docVectorValues = {230.0f, 300.33f, -34.8988f, 15.555f, -200.0f, 11.5f};
        BytesRef encodedDocVector = VectorEncoderDecoder.encodeSparseVector(docVectorDims, docVectorValues, docVectorDims.length);
        VectorScriptDocValues.SparseVectorScriptDocValues dvs = mock(VectorScriptDocValues.SparseVectorScriptDocValues.class);
        when(dvs.getEncodedValue()).thenReturn(encodedDocVector);
        Map<String, Number> queryVector = new HashMap<String, Number>() {{
            put("2", 0.5);
            put("10", 111.3);
            put("50", -13.0);
            put("113", 14.8);
            put("4545", -156.0);
            put("4548", -20.5);
        }};

        // test dotProduct
        DotProductSparse docProductSparse = new DotProductSparse(queryVector);
        double result = docProductSparse.dotProductSparse(dvs);
        assertEquals("dotProductSparse result is not equal to the expected value!", 65425.626, result, 0.001);

        // test cosineSimilarity
        CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector);
        double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs);
        assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.786, result2, 0.001);

        // test l1norm
        L1NormSparse l1Norm = new L1NormSparse(queryVector);
        double result3 = l1Norm.l1normSparse(dvs);
        assertEquals("l1normSparse result is not equal to the expected value!", 517.184, result3, 0.001);

        // test l2norm
        L2NormSparse l2Norm = new L2NormSparse(queryVector);
        double result4 = l2Norm.l2normSparse(dvs);
        assertEquals("l2normSparse result is not equal to the expected value!", 302.277, result4, 0.001);
    }
}