Backport distance functions vectors (#39330)

Distance functions for dense and sparse vectors

Backport for #37947, #39313
This commit is contained in:
Mayya Sharipova 2019-02-23 11:52:43 -05:00 committed by GitHub
parent 6e06f82106
commit e80284231d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 1343 additions and 74 deletions

View File

@ -9,7 +9,7 @@ not exceed 500. The number of dimensions can be
different across documents. A `dense_vector` field is
a single-valued field.
These vectors can be used for document scoring.
These vectors can be used for <<vector-functions,document scoring>>.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

View File

@ -9,7 +9,7 @@ not exceed 500. The number of dimensions can be
different across documents. A `sparse_vector` field is
a single-valued field.
These vectors can be used for document scoring.
These vectors can be used for <<vector-functions,document scoring>>.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

View File

@ -74,6 +74,113 @@ to be the most efficient by using the internal mechanisms.
--------------------------------------------------
// NOTCONSOLE
[[vector-functions]]
===== Functions for vector fields
These functions are used
for <<dense-vector,`dense_vector`>> and
<<sparse-vector,`sparse_vector`>> fields.
For dense_vector fields, `cosineSimilarity` calculates the measure of
cosine similarity between a given query vector and document vectors.
[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "cosineSimilarity(params.queryVector, doc['my_dense_vector'])",
"params": {
"queryVector": [4, 3.4, -0.2] <1>
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE
<1> To take advantage of the script optimizations, provide a query vector as a script parameter.
Similarly, for sparse_vector fields, `cosineSimilaritySparse` calculates cosine similarity
between a given query vector and document vectors.
[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "cosineSimilaritySparse(params.queryVector, doc['my_sparse_vector'])",
"params": {
"queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE
For dense_vector fields, `dotProduct` calculates the measure of
dot product between a given query vector and document vectors.
[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "dotProduct(params.queryVector, doc['my_dense_vector'])",
"params": {
"queryVector": [4, 3.4, -0.2]
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE
Similarly, for sparse_vector fields, `dotProductSparse` calculates dot product
between a given query vector and document vectors.
[source,js]
--------------------------------------------------
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "dotProductSparse(params.queryVector, doc['my_sparse_vector'])",
"params": {
"queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0}
}
}
}
}
}
--------------------------------------------------
// NOTCONSOLE
NOTE: If a document doesn't have a value for a vector field on which
a vector function is executed, 0 is returned as a result
for this document.
NOTE: If a document's dense vector field has a number of dimensions
different from the query's vector, 0 is used for missing dimensions
in the calculations of vector functions.
[[random-functions]]
===== Random functions

View File

@ -20,4 +20,13 @@
esplugin {
description 'Adds advanced field mappers'
classname 'org.elasticsearch.index.mapper.MapperExtrasPlugin'
extendedPlugins = ['lang-painless']
}
dependencies {
compileOnly project(':modules:lang-painless')
}
integTestCluster {
module project(':modules:lang-painless')
}

View File

@ -30,6 +30,7 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.VectorDVIndexFieldData;
import org.elasticsearch.search.DocValueFormat;
import java.io.IOException;
@ -119,8 +120,7 @@ public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMap
@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
return new VectorDVIndexFieldData.Builder(true);
}
@Override

View File

@ -30,6 +30,7 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.VectorDVIndexFieldData;
import org.elasticsearch.search.DocValueFormat;
import java.io.IOException;
@ -119,8 +120,7 @@ public class SparseVectorFieldMapper extends FieldMapper {
@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
return new VectorDVIndexFieldData.Builder(false);
}
@Override

View File

@ -23,7 +23,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
// static utility functions for encoding and decoding dense_vector and sparse_vector fields
final class VectorEncoderDecoder {
public final class VectorEncoderDecoder {
static final byte INT_BYTES = 4;
static final byte SHORT_BYTES = 2;
@ -34,10 +34,11 @@ final class VectorEncoderDecoder {
* BytesRef: int[] floats encoded as integers values, 2 bytes for each dimension
* @param values - values of the sparse array
* @param dims - dims of the sparse array
* @param dimCount - number of the dimension
* @param dimCount - number of the dimensions, necessary as values and dims are dynamically created arrays,
* and may be over-allocated
* @return BytesRef
*/
static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
public static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
// 1. Sort dims and values
sortSparseDimsValues(dims, values, dimCount);
byte[] buf = new byte[dimCount * (INT_BYTES + SHORT_BYTES)];
@ -66,9 +67,12 @@ final class VectorEncoderDecoder {
/**
* Decodes the first part of BytesRef into sparse vector dimensions
* @param vectorBR - vector decoded in BytesRef
* @param vectorBR - sparse vector encoded in BytesRef
*/
static int[] decodeSparseVectorDims(BytesRef vectorBR) {
public static int[] decodeSparseVectorDims(BytesRef vectorBR) {
if (vectorBR == null) {
throw new IllegalArgumentException("A document doesn't have a value for a vector field!");
}
int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
int[] dims = new int[dimCount];
int offset = vectorBR.offset;
@ -81,9 +85,12 @@ final class VectorEncoderDecoder {
/**
* Decodes the second part of the BytesRef into sparse vector values
* @param vectorBR - vector decoded in BytesRef
* @param vectorBR - sparse vector encoded in BytesRef
*/
static float[] decodeSparseVector(BytesRef vectorBR) {
public static float[] decodeSparseVector(BytesRef vectorBR) {
if (vectorBR == null) {
throw new IllegalArgumentException("A document doesn't have a value for a vector field!");
}
int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
int offset = vectorBR.offset + SHORT_BYTES * dimCount; //calculate the offset from where values are encoded
float[] vector = new float[dimCount];
@ -100,10 +107,14 @@ final class VectorEncoderDecoder {
/**
Sort dimensions in the ascending order and
sort values in the same order as their corresponding dimensions
**/
static void sortSparseDimsValues(int[] dims, float[] values, int n) {
* Sorts dimensions in the ascending order and
* sorts values in the same order as their corresponding dimensions
*
* @param dims - dimensions of the sparse query vector
* @param values - values for the sparse query vector
* @param n - number of dimensions
*/
public static void sortSparseDimsValues(int[] dims, float[] values, int n) {
new InPlaceMergeSorter() {
@Override
public int compare(int i, int j) {
@ -123,8 +134,42 @@ final class VectorEncoderDecoder {
}.sort(0, n);
}
// Decodes a BytesRef into an array of floats
static float[] decodeDenseVector(BytesRef vectorBR) {
/**
* Sorts dimensions in the ascending order and
* sorts values in the same order as their corresponding dimensions
*
* @param dims - dimensions of the sparse query vector
* @param values - values for the sparse query vector
* @param n - number of dimensions
*/
public static void sortSparseDimsDoubleValues(int[] dims, double[] values, int n) {
new InPlaceMergeSorter() {
@Override
public int compare(int i, int j) {
return Integer.compare(dims[i], dims[j]);
}
@Override
public void swap(int i, int j) {
int tempDim = dims[i];
dims[i] = dims[j];
dims[j] = tempDim;
double tempValue = values[j];
values[j] = values[i];
values[i] = tempValue;
}
}.sort(0, n);
}
/**
* Decodes a BytesRef into an array of floats
* @param vectorBR - dense vector encoded in BytesRef
*/
public static float[] decodeDenseVector(BytesRef vectorBR) {
if (vectorBR == null) {
throw new IllegalArgumentException("A document doesn't have a value for a vector field!");
}
int dimCount = vectorBR.length / INT_BYTES;
float[] vector = new float[dimCount];
int offset = vectorBR.offset;

View File

@ -0,0 +1,42 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.elasticsearch.painless.spi.PainlessExtension;
import org.elasticsearch.painless.spi.Whitelist;
import org.elasticsearch.painless.spi.WhitelistLoader;
import org.elasticsearch.script.ScoreScript;
import org.elasticsearch.script.ScriptContext;
import java.util.Collections;
import java.util.List;
import java.util.Map;
public class DocValuesWhitelistExtension implements PainlessExtension {
private static final Whitelist WHITELIST =
WhitelistLoader.loadFromResourceFiles(DocValuesWhitelistExtension.class, "docvalues_whitelist.txt");
@Override
public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
return Collections.singletonMap(ScoreScript.CONTEXT, Collections.singletonList(WHITELIST));
}
}

View File

@ -0,0 +1,218 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.VectorEncoderDecoder;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.index.mapper.VectorEncoderDecoder.sortSparseDimsDoubleValues;
public class ScoreScriptUtils {
//**************FUNCTIONS FOR DENSE VECTORS
/**
* Calculate a dot product between a query's dense vector and documents' dense vectors
*
* @param queryVector the query vector parsed as {@code List<Number>} from json
* @param dvs VectorScriptDocValues representing encoded documents' vectors
*/
public static double dotProduct(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
return intDotProduct(queryVector, docVector);
}
/**
* Calculate cosine similarity between a query's dense vector and documents' dense vectors
*
* CosineSimilarity is implemented as a class to use
* painless script caching to calculate queryVectorMagnitude
* only once per script execution for all documents.
* A user will call `cosineSimilarity(params.queryVector, doc['my_vector'])`
*/
public static final class CosineSimilarity {
final double queryVectorMagnitude;
final List<Number> queryVector;
// calculate queryVectorMagnitude once per query execution
public CosineSimilarity(List<Number> queryVector) {
this.queryVector = queryVector;
double doubleValue;
double dotProduct = 0;
for (Number value : queryVector) {
doubleValue = value.doubleValue();
dotProduct += doubleValue * doubleValue;
}
this.queryVectorMagnitude = Math.sqrt(dotProduct);
}
public double cosineSimilarity(VectorScriptDocValues.DenseVectorScriptDocValues dvs) {
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
// calculate docVector magnitude
double dotProduct = 0f;
for (int dim = 0; dim < docVector.length; dim++) {
dotProduct += (double) docVector[dim] * docVector[dim];
}
final double docVectorMagnitude = Math.sqrt(dotProduct);
double docQueryDotProduct = intDotProduct(queryVector, docVector);
return docQueryDotProduct / (docVectorMagnitude * queryVectorMagnitude);
}
}
private static double intDotProduct(List<Number> v1, float[] v2){
int dims = Math.min(v1.size(), v2.length);
double v1v2DotProduct = 0;
int dim = 0;
Iterator<Number> v1Iter = v1.iterator();
while(dim < dims) {
v1v2DotProduct += v1Iter.next().doubleValue() * v2[dim];
dim++;
}
return v1v2DotProduct;
}
//**************FUNCTIONS FOR SPARSE VECTORS
/**
* Calculate a dot product between a query's sparse vector and documents' sparse vectors
*
* DotProductSparse is implemented as a class to use
* painless script caching to prepare queryVector
* only once per script execution for all documents.
* A user will call `dotProductSparse(params.queryVector, doc['my_vector'])`
*/
public static final class DotProductSparse {
final double[] queryValues;
final int[] queryDims;
// prepare queryVector once per script execution
// queryVector represents a map of dimensions to values
public DotProductSparse(Map<String, Number> queryVector) {
//break vector into two arrays dims and values
int n = queryVector.size();
queryDims = new int[n];
queryValues = new double[n];
int i = 0;
for (Map.Entry<String, Number> dimValue : queryVector.entrySet()) {
try {
queryDims[i] = Integer.parseInt(dimValue.getKey());
} catch (final NumberFormatException e) {
throw new IllegalArgumentException("Failed to parse a query vector dimension, it must be an integer!", e);
}
queryValues[i] = dimValue.getValue().doubleValue();
i++;
}
// Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions
sortSparseDimsDoubleValues(queryDims, queryValues, n);
}
public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
return intDotProductSparse(queryValues, queryDims, docValues, docDims);
}
}
/**
* Calculate cosine similarity between a query's sparse vector and documents' sparse vectors
*
* CosineSimilaritySparse is implemented as a class to use
* painless script caching to prepare queryVector and calculate queryVectorMagnitude
* only once per script execution for all documents.
* A user will call `cosineSimilaritySparse(params.queryVector, doc['my_vector'])`
*/
public static final class CosineSimilaritySparse {
final double[] queryValues;
final int[] queryDims;
final double queryVectorMagnitude;
// prepare queryVector once per script execution
public CosineSimilaritySparse(Map<String, Number> queryVector) {
//break vector into two arrays dims and values
int n = queryVector.size();
queryValues = new double[n];
queryDims = new int[n];
double dotProduct = 0;
int i = 0;
for (Map.Entry<String, Number> dimValue : queryVector.entrySet()) {
try {
queryDims[i] = Integer.parseInt(dimValue.getKey());
} catch (final NumberFormatException e) {
throw new IllegalArgumentException("Failed to parse a query vector dimension, it must be an integer!", e);
}
queryValues[i] = dimValue.getValue().doubleValue();
dotProduct += queryValues[i] * queryValues[i];
i++;
}
this.queryVectorMagnitude = Math.sqrt(dotProduct);
// Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions
sortSparseDimsDoubleValues(queryDims, queryValues, n);
}
public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
BytesRef value = dvs.getEncodedValue();
if (value == null) return 0;
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
// calculate docVector magnitude
double dotProduct = 0;
for (float docValue : docValues) {
dotProduct += (double) docValue * docValue;
}
final double docVectorMagnitude = Math.sqrt(dotProduct);
double docQueryDotProduct = intDotProductSparse(queryValues, queryDims, docValues, docDims);
return docQueryDotProduct / (docVectorMagnitude * queryVectorMagnitude);
}
}
private static double intDotProductSparse(double[] v1Values, int[] v1Dims, float[] v2Values, int[] v2Dims) {
double v1v2DotProduct = 0;
int v1Index = 0;
int v2Index = 0;
// find common dimensions among vectors v1 and v2 and calculate dotProduct based on common dimensions
while (v1Index < v1Values.length && v2Index < v2Values.length) {
if (v1Dims[v1Index] == v2Dims[v2Index]) {
v1v2DotProduct += v1Values[v1Index] * v2Values[v2Index];
v1Index++;
v2Index++;
} else if (v1Dims[v1Index] > v2Dims[v2Index]) {
v2Index++;
} else {
v1Index++;
}
}
return v1v2DotProduct;
}
}

View File

@ -0,0 +1,80 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
final class VectorDVAtomicFieldData implements AtomicFieldData {
private final LeafReader reader;
private final String field;
private final boolean isDense;
VectorDVAtomicFieldData(LeafReader reader, String field, boolean isDense) {
this.reader = reader;
this.field = field;
this.isDense = isDense;
}
@Override
public long ramBytesUsed() {
return 0; // not exposed by Lucene
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
@Override
public SortedBinaryDocValues getBytesValues() {
throw new UnsupportedOperationException("String representation of doc values for vector fields is not supported");
}
@Override
public ScriptDocValues<BytesRef> getScriptValues() {
try {
final BinaryDocValues values = DocValues.getBinary(reader, field);
if (isDense) {
return new VectorScriptDocValues.DenseVectorScriptDocValues(values);
} else {
return new VectorScriptDocValues.SparseVectorScriptDocValues(values);
}
} catch (IOException e) {
throw new IllegalStateException("Cannot load doc values for vector field!", e);
}
}
@Override
public void close() {
// no-op
}
}

View File

@ -0,0 +1,74 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.SortField;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.MultiValueMode;
public class VectorDVIndexFieldData extends DocValuesIndexFieldData implements IndexFieldData<VectorDVAtomicFieldData> {
private final boolean isDense;
public VectorDVIndexFieldData(Index index, String fieldName, boolean isDense) {
super(index, fieldName);
this.isDense = isDense;
}
@Override
public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) {
throw new IllegalArgumentException("can't sort on the vector field");
}
@Override
public VectorDVAtomicFieldData load(LeafReaderContext context) {
return new VectorDVAtomicFieldData(context.reader(), fieldName, isDense);
}
@Override
public VectorDVAtomicFieldData loadDirect(LeafReaderContext context) throws Exception {
return load(context);
}
public static class Builder implements IndexFieldData.Builder {
private final boolean isDense;
public Builder(boolean isDense) {
this.isDense = isDense;
}
@Override
public IndexFieldData<?> build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache,
CircuitBreakerService breakerService, MapperService mapperService) {
final String fieldName = fieldType.name();
return new VectorDVIndexFieldData(indexSettings.getIndex(), fieldName, isDense);
}
}
}

View File

@ -0,0 +1,78 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import java.io.IOException;
/**
* VectorScriptDocValues represents docValues for dense and sparse vector fields
*/
public abstract class VectorScriptDocValues extends ScriptDocValues<BytesRef> {
private final BinaryDocValues in;
private BytesRef value;
VectorScriptDocValues(BinaryDocValues in) {
this.in = in;
}
@Override
public void setNextDocId(int docId) throws IOException {
if (in.advanceExact(docId)) {
value = in.binaryValue();
} else {
value = null;
}
}
// package private access only for {@link ScoreScriptUtils}
BytesRef getEncodedValue() {
return value;
}
@Override
public BytesRef get(int index) {
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
}
@Override
public int size() {
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
}
// not final, as it needs to be extended by Mockito for tests
public static class DenseVectorScriptDocValues extends VectorScriptDocValues {
public DenseVectorScriptDocValues(BinaryDocValues in) {
super(in);
}
}
// not final, as it needs to be extended by Mockito for tests
public static class SparseVectorScriptDocValues extends VectorScriptDocValues {
public SparseVectorScriptDocValues(BinaryDocValues in) {
super(in);
}
}
}

View File

@ -0,0 +1 @@
org.elasticsearch.index.query.DocValuesWhitelistExtension

View File

@ -0,0 +1,32 @@
#
# Licensed to Elasticsearch under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
class org.elasticsearch.index.query.VectorScriptDocValues {
}
class org.elasticsearch.index.query.VectorScriptDocValues$DenseVectorScriptDocValues {
}
class org.elasticsearch.index.query.VectorScriptDocValues$SparseVectorScriptDocValues {
}
static_import {
double cosineSimilarity(List, VectorScriptDocValues.DenseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$CosineSimilarity
double dotProduct(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.index.query.ScoreScriptUtils
double dotProductSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$DotProductSparse
double cosineSimilaritySparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$CosineSimilaritySparse
}

View File

@ -83,7 +83,7 @@ public class VectorEncoderDecoderTests extends ESTestCase {
}
// imitates the code in DenseVectorFieldMapper::parse
private BytesRef mockEncodeDenseVector(float[] dims) {
public static BytesRef mockEncodeDenseVector(float[] dims) {
final short INT_BYTES = VectorEncoderDecoder.INT_BYTES;
byte[] buf = new byte[INT_BYTES * dims.length];
int offset = 0;

View File

@ -0,0 +1,82 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.VectorEncoderDecoder;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.index.query.ScoreScriptUtils.CosineSimilarity;
import org.elasticsearch.index.query.ScoreScriptUtils.DotProductSparse;
import org.elasticsearch.index.query.ScoreScriptUtils.CosineSimilaritySparse;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.index.mapper.VectorEncoderDecoderTests.mockEncodeDenseVector;
import static org.elasticsearch.index.query.ScoreScriptUtils.dotProduct;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class ScoreScriptUtilsTests extends ESTestCase {

    /**
     * Verifies {@code dotProduct} and {@code cosineSimilarity} against a
     * dense document vector exposed through mocked script doc values.
     */
    public void testDenseVectorFunctions() {
        // Encode a dense document vector and serve it from mocked doc values.
        float[] docVector = {230.0f, 300.33f, -34.8988f, 15.555f, -200.0f};
        BytesRef encodedDocVector = mockEncodeDenseVector(docVector);
        VectorScriptDocValues.DenseVectorScriptDocValues docValues =
            mock(VectorScriptDocValues.DenseVectorScriptDocValues.class);
        when(docValues.getEncodedValue()).thenReturn(encodedDocVector);

        List<Number> queryVector = Arrays.asList(0.5, 111.3, -13.0, 14.8, -156.0);

        // dotProduct
        double dotProductResult = dotProduct(queryVector, docValues);
        assertEquals("dotProduct result is not equal to the expected value!", 65425.62, dotProductResult, 0.1);

        // cosineSimilarity
        CosineSimilarity cosineSimilarity = new CosineSimilarity(queryVector);
        double cosineResult = cosineSimilarity.cosineSimilarity(docValues);
        assertEquals("cosineSimilarity result is not equal to the expected value!", 0.78, cosineResult, 0.1);
    }

    /**
     * Verifies {@code dotProductSparse} and {@code cosineSimilaritySparse}
     * against a sparse document vector exposed through mocked script doc values.
     */
    public void testSparseVectorFunctions() {
        // Encode a sparse document vector (parallel dims/values arrays).
        int[] docVectorDims = {2, 10, 50, 113, 4545};
        float[] docVectorValues = {230.0f, 300.33f, -34.8988f, 15.555f, -200.0f};
        BytesRef encodedDocVector = VectorEncoderDecoder.encodeSparseVector(docVectorDims, docVectorValues, docVectorDims.length);
        VectorScriptDocValues.SparseVectorScriptDocValues docValues =
            mock(VectorScriptDocValues.SparseVectorScriptDocValues.class);
        when(docValues.getEncodedValue()).thenReturn(encodedDocVector);

        // Query vector keyed by dimension, mirroring the document vector's dims.
        Map<String, Number> queryVector = new HashMap<>();
        queryVector.put("2", 0.5);
        queryVector.put("10", 111.3);
        queryVector.put("50", -13.0);
        queryVector.put("113", 14.8);
        queryVector.put("4545", -156.0);

        // dotProductSparse
        DotProductSparse dotProductSparse = new DotProductSparse(queryVector);
        double dotProductResult = dotProductSparse.dotProductSparse(docValues);
        assertEquals("dotProductSparse result is not equal to the expected value!", 65425.62, dotProductResult, 0.1);

        // cosineSimilaritySparse
        CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector);
        double cosineResult = cosineSimilaritySparse.cosineSimilaritySparse(docValues);
        assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.78, cosineResult, 0.1);
    }
}

View File

@ -0,0 +1,100 @@
# REST tests for the dense_vector distance functions (dotProduct,
# cosineSimilarity) introduced in 7.1.0. Three documents are indexed so
# the expected score ordering can be asserted.
setup:
  - skip:
      features: headers
      version: " - 7.0.99"
      reason: "dense_vector functions were introduced in 7.1.0"

  - do:
      indices.create:
        include_type_name: false
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_dense_vector:
                type: dense_vector

  - do:
      index:
        index: test-index
        id: 1
        body:
          my_dense_vector: [230.0, 300.33, -34.8988, 15.555, -200.0]

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_dense_vector: [-0.5, 100.0, -13, 14.8, -156.0]

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_dense_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

  - do:
      indices.refresh: {}

---
"Dot Product":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "dotProduct(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

  - match: {hits.total: 3}
  # scores are asserted as [gte, lte] bands to tolerate float rounding
  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 65425.62}
  - lte: {hits.hits.0._score: 65425.63}
  - match: {hits.hits.1._id: "3"}
  - gte: {hits.hits.1._score: 37111.98}
  - lte: {hits.hits.1._score: 37111.99}
  - match: {hits.hits.2._id: "2"}
  - gte: {hits.hits.2._score: 35853.78}
  - lte: {hits.hits.2._score: 35853.79}

---
"Cosine Similarity":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [0.5, 111.3, -13.0, 14.8, -156.0]

  - match: {hits.total: 3}
  # doc 3 equals the query vector, so its cosine similarity is ~1.0
  - match: {hits.hits.0._id: "3"}
  - gte: {hits.hits.0._score: 0.999}
  - lte: {hits.hits.0._score: 1.001}
  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 0.998}
  - lte: {hits.hits.1._score: 1.0}
  - match: {hits.hits.2._id: "1"}
  - gte: {hits.hits.2._score: 0.78}
  - lte: {hits.hits.2._score: 0.791}

View File

@ -1,27 +0,0 @@
# Basic indexing smoke test for the dense_vector field type (7.0.0+).
setup:
  - skip:
      version: " - 6.99.99"
      reason: "dense_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_dense_vector:
                type: dense_vector

---
"Indexing":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_dense_vector: [1.5, -10, 3455, 345452.4545]

  - match: { result: created }

View File

@ -0,0 +1,152 @@
# Special-case REST tests for dense_vector distance functions:
# mixed dimensions/types, documents missing the vector field, and the
# error raised when a sparse function is applied to a dense field.
setup:
  - skip:
      features: headers
      version: " - 7.0.99"
      reason: "dense_vector functions were introduced in 7.1.0"

  - do:
      indices.create:
        include_type_name: false
        index: test-index
        body:
          settings:
            number_of_replicas: 0
            # we need to have 1 shard to get request failure in test "Dense vectors should error with sparse vector functions"
            number_of_shards: 1
          mappings:
            properties:
              my_dense_vector:
                type: dense_vector

---
"Vectors of different dimensions and data types":
  # document vectors of different dimensions
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_dense_vector: [10]

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_dense_vector: [10, 10.5]

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_dense_vector: [10, 10.5, 100.5]

  - do:
      indices.refresh: {}

  # query vector of type integer
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [10]

  - match: {hits.total: 3}
  - match: {hits.hits.0._id: "1"}
  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.2._id: "3"}

  # query vector of type double
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [10.0]

  - match: {hits.total: 3}
  - match: {hits.hits.0._id: "1"}
  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.2._id: "3"}

---
"Distance functions for documents missing vector field should return 0":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_dense_vector: [10]

  - do:
      index:
        index: test-index
        id: 2
        body:
          some_other_field: "random_value"

  - do:
      indices.refresh: {}

  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: [10.0]

  - match: {hits.total: 2}
  - match: {hits.hits.0._id: "1"}
  # doc 2 has no vector field, so the function scores it as 0.0
  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.1._score: 0.0}

---
"Dense vectors should error with sparse vector functions":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_dense_vector: [10, 2, 0.15]

  - do:
      indices.refresh: {}

  - do:
      catch: bad_request
      headers:
        Content-Type: application/json
      search:
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "dotProductSparse(params.query_vector, doc['my_dense_vector'])"
                params:
                  query_vector: {"2": 0.5, "10" : 111.3}

  - match: { error.root_cause.0.type: "script_exception" }

View File

@ -0,0 +1,100 @@
# REST tests for the sparse_vector distance functions (dotProductSparse,
# cosineSimilaritySparse) introduced in 7.1.0.
setup:
  - skip:
      features: headers
      version: " - 7.0.99"
      reason: "sparse_vector functions were introduced in 7.1.0"

  - do:
      indices.create:
        include_type_name: false
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_sparse_vector:
                type: sparse_vector

  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"2": 230.0, "10" : 300.33, "50": -34.8988, "113": 15.555, "4545": -200.0}

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_sparse_vector: {"2": -0.5, "10" : 100.0, "50": -13, "113": 14.8, "4545": -156.0}

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_sparse_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}

  - do:
      indices.refresh: {}

---
"Dot Product":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "dotProductSparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}

  - match: {hits.total: 3}
  # scores are asserted as [gte, lte] bands to tolerate float rounding
  - match: {hits.hits.0._id: "1"}
  - gte: {hits.hits.0._score: 65425.62}
  - lte: {hits.hits.0._score: 65425.63}
  - match: {hits.hits.1._id: "3"}
  - gte: {hits.hits.1._score: 37111.98}
  - lte: {hits.hits.1._score: 37111.99}
  - match: {hits.hits.2._id: "2"}
  - gte: {hits.hits.2._score: 35853.78}
  - lte: {hits.hits.2._score: 35853.79}

---
"Cosine Similarity":
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"2": -0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0}

  - match: {hits.total: 3}
  - match: {hits.hits.0._id: "3"}
  - gte: {hits.hits.0._score: 0.999}
  - lte: {hits.hits.0._score: 1.001}
  - match: {hits.hits.1._id: "2"}
  - gte: {hits.hits.1._score: 0.998}
  - lte: {hits.hits.1._score: 1.0}
  - match: {hits.hits.2._id: "1"}
  - gte: {hits.hits.2._score: 0.78}
  - lte: {hits.hits.2._score: 0.791}

View File

@ -1,27 +0,0 @@
# Basic indexing smoke test for the sparse_vector field type (7.0.0+).
setup:
  - skip:
      version: " - 6.99.99"
      reason: "sparse_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_sparse_vector:
                type: sparse_vector

---
"Indexing":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: { "50" : 1.8, "2" : -0.4, "10" : 1000.3, "4545" : -0.00004}

  - match: { result: created }

View File

@ -0,0 +1,203 @@
# Special-case REST tests for sparse_vector distance functions:
# mixed dimensions/types, missing vector field, dimension-order
# independence, and the error raised when a dense function is applied
# to a sparse field.
setup:
  - skip:
      features: headers
      version: " - 7.0.99"
      reason: "sparse_vector functions were introduced in 7.1.0"

  - do:
      indices.create:
        include_type_name: false
        index: test-index
        body:
          settings:
            number_of_replicas: 0
            # we need to have 1 shard to get request failure in test "Sparse vectors should error with dense vector functions"
            number_of_shards: 1
          mappings:
            properties:
              my_sparse_vector:
                type: sparse_vector

---
"Vectors of different dimensions and data types":
  # document vectors of different dimensions
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"1": 10}

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_sparse_vector: {"1": 10, "10" : 10.5}

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_sparse_vector: {"1": 10, "10" : 10.5, "100": 100.5}

  - do:
      indices.refresh: {}

  # query vector of type integer
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10}

  - match: {hits.total: 3}
  - match: {hits.hits.0._id: "1"}
  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.2._id: "3"}

  # query vector of type double
  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10.0}

  - match: {hits.total: 3}
  - match: {hits.hits.0._id: "1"}
  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.2._id: "3"}

---
"Distance functions for documents missing vector field should return 0":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"1": 10}

  - do:
      index:
        index: test-index
        id: 2
        body:
          some_other_field: "random_value"

  - do:
      indices.refresh: {}

  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"1": 10.0}

  - match: {hits.total: 2}
  - match: {hits.hits.0._id: "1"}
  # doc 2 has no vector field, so the function scores it as 0.0
  - match: {hits.hits.1._id: "2"}
  - match: {hits.hits.1._score: 0.0}

---
"Dimensions can be sorted differently":
  # All the documents' and query's vectors are the same, and should return cosineSimilarity equal to 1
  # NOTE(review): doc 2 uses dimension "113" where docs 1 and 3 use "30" —
  # the wide [0.99, 1.001] score band passes either way; confirm whether
  # "113" is intentional.
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"2": 230.0, "11" : 300.33, "12": -34.8988, "30": 15.555, "100": -200.0}

  - do:
      index:
        index: test-index
        id: 2
        body:
          my_sparse_vector: {"100": -200.0, "12": -34.8988, "11" : 300.33, "113": 15.555, "2": 230.0}

  - do:
      index:
        index: test-index
        id: 3
        body:
          my_sparse_vector: {"100": -200.0, "30": 15.555, "12": -34.8988, "11" : 300.33, "2": 230.0}

  - do:
      indices.refresh: {}

  - do:
      headers:
        Content-Type: application/json
      search:
        rest_total_hits_as_int: true
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: {"100": -200.0, "11" : 300.33, "12": -34.8988, "2": 230.0, "30": 15.555}

  - match: {hits.total: 3}
  - gte: {hits.hits.0._score: 0.99}
  - lte: {hits.hits.0._score: 1.001}
  - gte: {hits.hits.1._score: 0.99}
  - lte: {hits.hits.1._score: 1.001}
  - gte: {hits.hits.2._score: 0.99}
  - lte: {hits.hits.2._score: 1.001}

---
"Sparse vectors should error with dense vector functions":
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_sparse_vector: {"100": -200.0, "30": 15.555}

  - do:
      indices.refresh: {}

  - do:
      catch: bad_request
      headers:
        Content-Type: application/json
      search:
        body:
          query:
            script_score:
              query: {match_all: {} }
              script:
                source: "dotProduct(params.query_vector, doc['my_sparse_vector'])"
                params:
                  query_vector: [0.5, 111]

  - match: { error.root_cause.0.type: "script_exception" }