Vector field (#33022)
1. Dense vector

PUT dindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": { "type": "dense_vector" },
        "my_text":   { "type": "keyword" }
      }
    }
  }
}

PUT dindex/_doc/1
{
  "my_text":   "text1",
  "my_vector": [0.5, 10, 6]
}

2. Sparse vector

PUT sindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": { "type": "sparse_vector" },
        "my_text":   { "type": "keyword" }
      }
    }
  }
}

PUT sindex/_doc/1
{
  "my_text":   "text1",
  "my_vector": {"1": 0.5, "99": -0.5, "5": 1}
}
This commit is contained in:
parent 9c1cdea839
commit b5d532f9e3
@@ -47,6 +47,11 @@ string:: <<text,`text`>> and <<keyword,`keyword`>>

<<feature-vector>>:: Record numeric feature vectors to boost hits at query time.

<<dense-vector>>:: Record dense vectors of float values.

<<sparse-vector>>:: Record sparse vectors of float values.

[float]
=== Multi-fields

@@ -98,3 +103,7 @@ include::types/parent-join.asciidoc[]

include::types/feature.asciidoc[]
include::types/feature-vector.asciidoc[]
include::types/dense-vector.asciidoc[]
include::types/sparse-vector.asciidoc[]
@@ -0,0 +1,52 @@
[[dense-vector]]
=== Dense vector datatype

A `dense_vector` field stores dense vectors of float values.
The number of dimensions in a vector must not exceed 500.
The number of dimensions can be different across documents.
A `dense_vector` field is a single-valued field.

These vectors can be used for document scoring.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

You index a dense vector as an array of floats.

[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "dense_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_text" : "text1",
  "my_vector" : [0.5, 10, 6]
}

PUT my_index/_doc/2
{
  "my_text" : "text2",
  "my_vector" : [-0.5, 10, 10, 4]
}

--------------------------------------------------
// CONSOLE

Internally, each document's dense vector is encoded as a binary
doc value. Its size in bytes is equal to
`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is
the number of the vector's dimensions.
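A quick sanity check of that `4 * NUMBER_OF_DIMENSIONS` figure: the standalone sketch below (plain Java written for this note, not code from the commit; the class name `DenseVectorLayoutDemo` is made up) packs a three-dimension vector in the same big-endian `Float.floatToIntBits` layout the mapper uses and verifies the round trip.

[source,java]
--------------------------------------------------
import java.util.Arrays;

// Sketch of the documented layout: each float dimension is stored
// big-endian as the 4 bytes of Float.floatToIntBits, so the encoded
// size is 4 * NUMBER_OF_DIMENSIONS.
public class DenseVectorLayoutDemo {

    static byte[] encode(float[] vector) {
        byte[] buf = new byte[4 * vector.length];
        int offset = 0;
        for (float v : vector) {
            int bits = Float.floatToIntBits(v);
            buf[offset]     = (byte) (bits >> 24);
            buf[offset + 1] = (byte) (bits >> 16);
            buf[offset + 2] = (byte) (bits >> 8);
            buf[offset + 3] = (byte) bits;
            offset += 4;
        }
        return buf;
    }

    static float[] decode(byte[] buf) {
        float[] vector = new float[buf.length / 4];
        for (int dim = 0; dim < vector.length; dim++) {
            int o = dim * 4;
            int bits = ((buf[o] & 0xFF) << 24) | ((buf[o + 1] & 0xFF) << 16)
                | ((buf[o + 2] & 0xFF) << 8) | (buf[o + 3] & 0xFF);
            vector[dim] = Float.intBitsToFloat(bits);
        }
        return vector;
    }

    public static void main(String[] args) {
        float[] vector = {0.5f, 10f, 6f};
        byte[] encoded = encode(vector);
        System.out.println("size in bytes: " + encoded.length); // 12 = 4 * 3
        System.out.println("round trip: " + Arrays.toString(decode(encoded)));
    }
}
--------------------------------------------------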
@@ -0,0 +1,55 @@
[[sparse-vector]]
=== Sparse vector datatype

A `sparse_vector` field stores sparse vectors of float values.
The number of dimensions in a vector must not exceed 500.
The number of dimensions can be different across documents.
A `sparse_vector` field is a single-valued field.

These vectors can be used for document scoring.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

You represent a sparse vector as an object, where object fields
are dimensions and field values are the values for those dimensions.
Dimensions are integer values from `0` to `65535` encoded as strings.
Dimensions don't need to be in order.

[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "sparse_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_text" : "text1",
  "my_vector" : {"1": 0.5, "5": -0.5, "100": 1}
}

PUT my_index/_doc/2
{
  "my_text" : "text2",
  "my_vector" : {"103": 0.5, "4": -0.5, "5": 1, "11" : 1.2}
}

--------------------------------------------------
// CONSOLE

Internally, each document's sparse vector is encoded as a binary
doc value. Its size in bytes is equal to
`6 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is
the number of the vector's dimensions.
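The `6 * NUMBER_OF_DIMENSIONS` figure follows from the layout used by `VectorEncoderDecoder` further down in this commit: all dimension numbers first (2 bytes each, sorted ascending), then all values (4 bytes each). A standalone sketch of that layout (plain Java written for this note; `SparseVectorLayoutDemo` is a made-up name):

[source,java]
--------------------------------------------------
// Sketch of the sparse layout: a block of 2-byte big-endian dimension
// numbers (sorted ascending) followed by a block of 4-byte float values,
// i.e. 6 bytes per dimension overall.
public class SparseVectorLayoutDemo {
    public static void main(String[] args) {
        int[] dims = {1, 5, 100};            // already sorted ascending
        float[] values = {0.5f, -0.5f, 1f};  // values[i] belongs to dims[i]
        byte[] buf = new byte[6 * dims.length];
        int offset = 0;
        for (int d : dims) {                 // 2-byte dimension block
            buf[offset++] = (byte) (d >> 8);
            buf[offset++] = (byte) d;
        }
        for (float v : values) {             // 4-byte value block
            int bits = Float.floatToIntBits(v);
            buf[offset++] = (byte) (bits >> 24);
            buf[offset++] = (byte) (bits >> 16);
            buf[offset++] = (byte) (bits >> 8);
            buf[offset++] = (byte) bits;
        }
        System.out.println("size in bytes: " + buf.length); // 18 = 6 * 3
    }
}
--------------------------------------------------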
@@ -0,0 +1,195 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;

/**
 * A {@link FieldMapper} for indexing a dense vector of floats.
 */
public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser {

    public static final String CONTENT_TYPE = "dense_vector";
    public static short MAX_DIMS_COUNT = 500; // maximum allowed number of dimensions
    private static final byte INT_BYTES = 4;

    public static class Defaults {
        public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType();

        static {
            FIELD_TYPE.setTokenized(false);
            FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
            FIELD_TYPE.setHasDocValues(true);
            FIELD_TYPE.setOmitNorms(true);
            FIELD_TYPE.freeze();
        }
    }

    public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> {

        public Builder(String name) {
            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
            builder = this;
        }

        @Override
        public DenseVectorFieldType fieldType() {
            return (DenseVectorFieldType) super.fieldType();
        }

        @Override
        public DenseVectorFieldMapper build(BuilderContext context) {
            setupFieldType(context);
            return new DenseVectorFieldMapper(
                name, fieldType, defaultFieldType,
                context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
        }
    }

    public static class TypeParser implements Mapper.TypeParser {
        @Override
        public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
            DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name);
            return builder;
        }
    }

    public static final class DenseVectorFieldType extends MappedFieldType {

        public DenseVectorFieldType() {}

        protected DenseVectorFieldType(DenseVectorFieldType ref) {
            super(ref);
        }

        public DenseVectorFieldType clone() {
            return new DenseVectorFieldType(this);
        }

        @Override
        public String typeName() {
            return CONTENT_TYPE;
        }

        @Override
        public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
        }

        @Override
        public Query existsQuery(QueryShardContext context) {
            return new DocValuesFieldExistsQuery(name());
        }

        @Override
        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
        }

        @Override
        public Query termQuery(Object value, QueryShardContext context) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
        }
    }

    private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
                                   Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
        assert fieldType.indexOptions() == IndexOptions.NONE;
    }

    @Override
    protected DenseVectorFieldMapper clone() {
        return (DenseVectorFieldMapper) super.clone();
    }

    @Override
    public DenseVectorFieldType fieldType() {
        return (DenseVectorFieldType) super.fieldType();
    }

    @Override
    public void parse(ParseContext context) throws IOException {
        if (context.externalValueSet()) {
            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
        }

        // encode the array of floats as an array of integers and store it in buf;
        // this code lives here and not in the VectorEncoderDecoder so as not to create extra arrays
        byte[] buf = new byte[0];
        int offset = 0;
        int dim = 0;
        for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) {
            ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation);
            float value = context.parser().floatValue(true);
            if (buf.length < (offset + INT_BYTES)) {
                buf = ArrayUtil.grow(buf, (offset + INT_BYTES));
            }
            int intValue = Float.floatToIntBits(value);
            buf[offset] = (byte) (intValue >> 24);
            buf[offset+1] = (byte) (intValue >> 16);
            buf[offset+2] = (byte) (intValue >> 8);
            buf[offset+3] = (byte) intValue;
            offset += INT_BYTES;
            dim++;
            if (dim >= MAX_DIMS_COUNT) {
                throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
                    "] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
            }
        }
        BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset));
        if (context.doc().getByKey(fieldType().name()) != null) {
            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
                "] doesn't support indexing multiple values for the same field in the same document");
        }
        context.doc().addWithKey(fieldType().name(), field);
    }

    @Override
    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
        throw new AssertionError("parse is implemented directly");
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }
}
@@ -39,6 +39,8 @@ public class MapperExtrasPlugin extends Plugin implements MapperPlugin, SearchPlugin
        mappers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser());
        mappers.put(FeatureFieldMapper.CONTENT_TYPE, new FeatureFieldMapper.TypeParser());
        mappers.put(FeatureVectorFieldMapper.CONTENT_TYPE, new FeatureVectorFieldMapper.TypeParser());
        mappers.put(DenseVectorFieldMapper.CONTENT_TYPE, new DenseVectorFieldMapper.TypeParser());
        mappers.put(SparseVectorFieldMapper.CONTENT_TYPE, new SparseVectorFieldMapper.TypeParser());
        return Collections.unmodifiableMap(mappers);
    }
@@ -0,0 +1,207 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;

/**
 * A {@link FieldMapper} for indexing a sparse vector of floats.
 */
public class SparseVectorFieldMapper extends FieldMapper {

    public static final String CONTENT_TYPE = "sparse_vector";
    public static short MAX_DIMS_COUNT = 500; // maximum allowed number of dimensions
    public static int MAX_DIMS_NUMBER = 65535; // maximum allowed dimension number

    public static class Defaults {
        public static final MappedFieldType FIELD_TYPE = new SparseVectorFieldType();

        static {
            FIELD_TYPE.setTokenized(false);
            FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
            FIELD_TYPE.setHasDocValues(true);
            FIELD_TYPE.setOmitNorms(true);
            FIELD_TYPE.freeze();
        }
    }

    public static class Builder extends FieldMapper.Builder<Builder, SparseVectorFieldMapper> {

        public Builder(String name) {
            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
            builder = this;
        }

        @Override
        public SparseVectorFieldType fieldType() {
            return (SparseVectorFieldType) super.fieldType();
        }

        @Override
        public SparseVectorFieldMapper build(BuilderContext context) {
            setupFieldType(context);
            return new SparseVectorFieldMapper(
                name, fieldType, defaultFieldType,
                context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
        }
    }

    public static class TypeParser implements Mapper.TypeParser {
        @Override
        public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
            SparseVectorFieldMapper.Builder builder = new SparseVectorFieldMapper.Builder(name);
            return builder;
        }
    }

    public static final class SparseVectorFieldType extends MappedFieldType {

        public SparseVectorFieldType() {}

        protected SparseVectorFieldType(SparseVectorFieldType ref) {
            super(ref);
        }

        public SparseVectorFieldType clone() {
            return new SparseVectorFieldType(this);
        }

        @Override
        public String typeName() {
            return CONTENT_TYPE;
        }

        @Override
        public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
        }

        @Override
        public Query existsQuery(QueryShardContext context) {
            return new DocValuesFieldExistsQuery(name());
        }

        @Override
        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
        }

        @Override
        public Query termQuery(Object value, QueryShardContext context) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
        }
    }

    private SparseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
                                    Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
        assert fieldType.indexOptions() == IndexOptions.NONE;
    }

    @Override
    protected SparseVectorFieldMapper clone() {
        return (SparseVectorFieldMapper) super.clone();
    }

    @Override
    public SparseVectorFieldType fieldType() {
        return (SparseVectorFieldType) super.fieldType();
    }

    @Override
    public void parse(ParseContext context) throws IOException {
        if (context.externalValueSet()) {
            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
        }
        ensureExpectedToken(Token.START_OBJECT, context.parser().currentToken(), context.parser()::getTokenLocation);
        int[] dims = new int[0];
        float[] values = new float[0];
        int dimCount = 0;
        int dim = 0;
        float value;
        for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
            if (token == Token.FIELD_NAME) {
                try {
                    dim = Integer.parseInt(context.parser().currentName());
                    if (dim < 0 || dim > MAX_DIMS_NUMBER) {
                        throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimension number " +
                            "must be a non-negative integer value not exceeding [" + MAX_DIMS_NUMBER + "], got [" + dim + "]");
                    }
                } catch (NumberFormatException e) {
                    throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimensions should be " +
                        "integers represented as strings, but got [" + context.parser().currentName() + "]", e);
                }
            } else if (token == Token.VALUE_NUMBER) {
                value = context.parser().floatValue(true);
                if (dims.length <= dimCount) { // ensure arrays have enough capacity
                    values = ArrayUtil.grow(values, dimCount + 1);
                    dims = ArrayUtil.grow(dims, dimCount + 1);
                }
                dims[dimCount] = dim;
                values[dimCount] = value;
                dimCount++;
                if (dimCount >= MAX_DIMS_COUNT) {
                    throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
                        "] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
                }
            } else {
                throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
                    "] takes an object that maps a dimension number to a float, but got unexpected token [" + token + "]");
            }
        }

        BytesRef br = VectorEncoderDecoder.encodeSparseVector(dims, values, dimCount);
        BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), br);
        context.doc().addWithKey(fieldType().name(), field);
    }

    @Override
    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
        throw new AssertionError("parse is implemented directly");
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }
}
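The `parse()` method above walks the token stream directly: every `FIELD_NAME` token is parsed as a dimension number and every `VALUE_NUMBER` token as that dimension's value. Below is a minimal standalone imitation of that walk, using jackson-core instead of Elasticsearch's `XContentParser` (an assumption made so the sketch runs on its own; the class name is invented):

[source,java]
--------------------------------------------------
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

// Walks {"1": 0.5, "5": -0.5, "100": 1} the way SparseVectorFieldMapper.parse
// does: FIELD_NAME tokens become dimension numbers, number tokens become values.
public class SparseVectorTokenWalkDemo {
    public static void main(String[] args) throws Exception {
        JsonParser parser = new JsonFactory()
            .createParser("{\"1\": 0.5, \"5\": -0.5, \"100\": 1}");
        parser.nextToken(); // consume START_OBJECT
        int dim = 0;
        for (JsonToken token = parser.nextToken(); token != JsonToken.END_OBJECT; token = parser.nextToken()) {
            if (token == JsonToken.FIELD_NAME) {
                dim = Integer.parseInt(parser.getCurrentName()); // dims arrive as strings
            } else if (token.isNumeric()) {
                System.out.println("dim " + dim + " -> " + parser.getFloatValue());
            } else {
                throw new IllegalArgumentException("unexpected token [" + token + "]");
            }
        }
    }
}
--------------------------------------------------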
@@ -0,0 +1,141 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;

// static utility functions for encoding and decoding dense_vector and sparse_vector fields
final class VectorEncoderDecoder {
    static final byte INT_BYTES = 4;
    static final byte SHORT_BYTES = 2;

    private VectorEncoderDecoder() { }

    /**
     * Encodes a sparse vector represented by dims, values and dimCount into a BytesRef:
     * first all dimensions (2 bytes each), then all float values encoded as integers (4 bytes each)
     * @param dims - dimensions of the sparse vector
     * @param values - values of the sparse vector
     * @param dimCount - number of dimensions
     * @return BytesRef
     */
    static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
        // 1. Sort dims and values
        sortSparseDimsValues(dims, values, dimCount);
        byte[] buf = new byte[dimCount * (INT_BYTES + SHORT_BYTES)];

        // 2. Encode dimensions
        // as each dimension is a positive value that doesn't exceed 65535, 2 bytes is enough for encoding it
        int offset = 0;
        for (int dim = 0; dim < dimCount; dim++) {
            buf[offset] = (byte) (dims[dim] >> 8);
            buf[offset+1] = (byte) dims[dim];
            offset += SHORT_BYTES;
        }

        // 3. Encode values
        for (int dim = 0; dim < dimCount; dim++) {
            int intValue = Float.floatToIntBits(values[dim]);
            buf[offset] = (byte) (intValue >> 24);
            buf[offset+1] = (byte) (intValue >> 16);
            buf[offset+2] = (byte) (intValue >> 8);
            buf[offset+3] = (byte) intValue;
            offset += INT_BYTES;
        }

        return new BytesRef(buf);
    }

    /**
     * Decodes the first part of the BytesRef into the sparse vector dimensions
     * @param vectorBR - vector encoded as a BytesRef
     */
    static int[] decodeSparseVectorDims(BytesRef vectorBR) {
        int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
        int[] dims = new int[dimCount];
        int offset = vectorBR.offset;
        for (int dim = 0; dim < dimCount; dim++) {
            dims[dim] = ((vectorBR.bytes[offset] & 0xFF) << 8) | (vectorBR.bytes[offset+1] & 0xFF);
            offset += SHORT_BYTES;
        }
        return dims;
    }

    /**
     * Decodes the second part of the BytesRef into the sparse vector values
     * @param vectorBR - vector encoded as a BytesRef
     */
    static float[] decodeSparseVector(BytesRef vectorBR) {
        int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
        int offset = vectorBR.offset + SHORT_BYTES * dimCount; // calculate the offset from where values are encoded
        float[] vector = new float[dimCount];
        for (int dim = 0; dim < dimCount; dim++) {
            int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) |
                ((vectorBR.bytes[offset+1] & 0xFF) << 16) |
                ((vectorBR.bytes[offset+2] & 0xFF) << 8) |
                (vectorBR.bytes[offset+3] & 0xFF);
            vector[dim] = Float.intBitsToFloat(intValue);
            offset += INT_BYTES;
        }
        return vector;
    }

    /**
     * Sorts dimensions in ascending order and
     * sorts values in the same order as their corresponding dimensions
     */
    static void sortSparseDimsValues(int[] dims, float[] values, int n) {
        new InPlaceMergeSorter() {
            @Override
            public int compare(int i, int j) {
                return Integer.compare(dims[i], dims[j]);
            }

            @Override
            public void swap(int i, int j) {
                int tempDim = dims[i];
                dims[i] = dims[j];
                dims[j] = tempDim;

                float tempValue = values[j];
                values[j] = values[i];
                values[i] = tempValue;
            }
        }.sort(0, n);
    }

    // Decodes a BytesRef into an array of floats
    static float[] decodeDenseVector(BytesRef vectorBR) {
        int dimCount = vectorBR.length / INT_BYTES;
        float[] vector = new float[dimCount];
        int offset = vectorBR.offset;
        for (int dim = 0; dim < dimCount; dim++) {
            int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) |
                ((vectorBR.bytes[offset+1] & 0xFF) << 16) |
                ((vectorBR.bytes[offset+2] & 0xFF) << 8) |
                (vectorBR.bytes[offset+3] & 0xFF);
            vector[dim] = Float.intBitsToFloat(intValue);
            offset += INT_BYTES;
        }
        return vector;
    }
}
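`sortSparseDimsValues` is the interesting utility here: it sorts the `dims` array while mirroring every swap into the parallel `values` array, so `values[i]` stays attached to `dims[i]`. The same idea with a plain insertion sort, as a self-contained sketch (Lucene's `InPlaceMergeSorter` used above is just a faster, allocation-free way to do this; `ParallelSortDemo` is a made-up name):

[source,java]
--------------------------------------------------
import java.util.Arrays;

// Co-sorts dims ascending and applies identical swaps to values,
// mirroring what VectorEncoderDecoder.sortSparseDimsValues achieves.
public class ParallelSortDemo {

    static void sortDimsValues(int[] dims, float[] values, int n) {
        for (int i = 1; i < n; i++) {
            for (int j = i; j > 0 && dims[j - 1] > dims[j]; j--) {
                int d = dims[j - 1]; dims[j - 1] = dims[j]; dims[j] = d;
                float v = values[j - 1]; values[j - 1] = values[j]; values[j] = v;
            }
        }
    }

    public static void main(String[] args) {
        int[] dims = {103, 4, 5, 11};
        float[] values = {0.5f, -0.5f, 1f, 1.2f}; // values[i] belongs to dims[i]
        sortDimsValues(dims, values, dims.length);
        System.out.println(Arrays.toString(dims));   // [4, 5, 11, 103]
        System.out.println(Arrays.toString(values)); // [-0.5, 1.0, 1.2, 0.5]
    }
}
--------------------------------------------------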
@@ -0,0 +1,81 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.hamcrest.Matchers;

import java.util.Collection;

public class DenseVectorFieldMapperTests extends ESSingleNodeTestCase {

    @Override
    protected Collection<Class<? extends Plugin>> getPlugins() {
        return pluginList(MapperExtrasPlugin.class);
    }

    public void testDefaults() throws Exception {
        IndexService indexService = createIndex("test-index");
        DocumentMapperParser parser = indexService.mapperService().documentMapperParser();
        String mapping = Strings.toString(XContentFactory.jsonBuilder()
            .startObject()
                .startObject("_doc")
                    .startObject("properties")
                        .startObject("my-dense-vector").field("type", "dense_vector")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject());

        DocumentMapper mapper = parser.parse("_doc", new CompressedXContent(mapping));
        assertEquals(mapping, mapper.mappingSource().toString());

        float[] expectedArray = {-12.1f, 100.7f, -4};
        ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
            .bytes(XContentFactory.jsonBuilder()
                .startObject()
                    .startArray("my-dense-vector").value(expectedArray[0]).value(expectedArray[1]).value(expectedArray[2]).endArray()
                .endObject()),
            XContentType.JSON));
        IndexableField[] fields = doc1.rootDoc().getFields("my-dense-vector");
        assertEquals(1, fields.length);
        assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class));

        // assert that after decoding the indexed value is equal to expected
        BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue();
        float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(vectorBR);
        assertArrayEquals(
            "Decoded dense vector values are not equal to the indexed ones.",
            expectedArray,
            decodedValues,
            0.001f
        );
    }
}
@@ -0,0 +1,28 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

public class DenseVectorFieldTypeTests extends FieldTypeTestCase {

    @Override
    protected MappedFieldType createDefaultFieldType() {
        return new DenseVectorFieldMapper.DenseVectorFieldType();
    }
}
@@ -0,0 +1,164 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;

import java.util.Collection;

import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.core.IsInstanceOf.instanceOf;

public class SparseVectorFieldMapperTests extends ESSingleNodeTestCase {
    private DocumentMapper mapper;

    @Before
    public void setup() throws Exception {
        IndexService indexService = createIndex("test-index");
        DocumentMapperParser parser = indexService.mapperService().documentMapperParser();
        String mapping = Strings.toString(XContentFactory.jsonBuilder()
            .startObject()
                .startObject("_doc")
                    .startObject("properties")
                        .startObject("my-sparse-vector").field("type", "sparse_vector")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject());
        mapper = parser.parse("_doc", new CompressedXContent(mapping));
    }

    @Override
    protected Collection<Class<? extends Plugin>> getPlugins() {
        return pluginList(MapperExtrasPlugin.class);
    }

    public void testDefaults() throws Exception {
        int[] indexedDims = {65535, 50, 2};
        float[] indexedValues = {0.5f, 1800f, -34567.11f};
        ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
            .bytes(XContentFactory.jsonBuilder()
                .startObject()
                    .startObject("my-sparse-vector")
                        .field(Integer.toString(indexedDims[0]), indexedValues[0])
                        .field(Integer.toString(indexedDims[1]), indexedValues[1])
                        .field(Integer.toString(indexedDims[2]), indexedValues[2])
                    .endObject()
                .endObject()),
            XContentType.JSON));
        IndexableField[] fields = doc1.rootDoc().getFields("my-sparse-vector");
        assertEquals(1, fields.length);
        assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class));

        // assert that after decoding the indexed dims and values are equal to expected
        int[] expectedDims = {2, 50, 65535}; // the same as indexed but sorted
        float[] expectedValues = {-34567.11f, 1800f, 0.5f}; // the same as indexed but sorted by their dimensions

        BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue();
        int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(vectorBR);
        assertArrayEquals(
            "Decoded sparse vector dimensions are not equal to the indexed ones.",
            expectedDims,
            decodedDims
        );
        float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(vectorBR);
        assertArrayEquals(
            "Decoded sparse vector values are not equal to the indexed ones.",
            expectedValues,
            decodedValues,
            0.001f
        );
    }

    public void testErrors() {
        // 1. test for an error on a negative dimension
        MapperParsingException e = expectThrows(MapperParsingException.class, () -> {
            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
                .bytes(XContentFactory.jsonBuilder()
                    .startObject()
                        .startObject("my-sparse-vector")
                            .field(Integer.toString(-50), 100f)
                        .endObject()
                    .endObject()),
                XContentType.JSON));
        });
        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
        assertThat(e.getCause().getMessage(), containsString(
            "dimension number must be a non-negative integer value not exceeding [65535], got [-50]"));

        // 2. test for an error on a dimension greater than MAX_DIMS_NUMBER
        e = expectThrows(MapperParsingException.class, () -> {
            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
                .bytes(XContentFactory.jsonBuilder()
                    .startObject()
                        .startObject("my-sparse-vector")
                            .field(Integer.toString(70000), 100f)
                        .endObject()
                    .endObject()),
                XContentType.JSON));
        });
        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
        assertThat(e.getCause().getMessage(), containsString(
            "dimension number must be a non-negative integer value not exceeding [65535], got [70000]"));

        // 3. test for an error on a wrongly formatted dimension
        e = expectThrows(MapperParsingException.class, () -> {
            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
                .bytes(XContentFactory.jsonBuilder()
                    .startObject()
                        .startObject("my-sparse-vector")
                            .field("WrongDim123", 100f)
                        .endObject()
                    .endObject()),
                XContentType.JSON));
        });
        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
        assertThat(e.getCause().getMessage(), containsString(
            "dimensions should be integers represented as strings, but got [WrongDim123]"));

        // 4. test for an error on a wrong format for the map of dims to values
        e = expectThrows(MapperParsingException.class, () -> {
            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
                .bytes(XContentFactory.jsonBuilder()
                    .startObject()
                        .startObject("my-sparse-vector")
                            .startArray(Integer.toString(10)).value(10f).value(100f).endArray()
                        .endObject()
                    .endObject()),
                XContentType.JSON));
        });
        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
        assertThat(e.getCause().getMessage(), containsString(
            "takes an object that maps a dimension number to a float, but got unexpected token [START_ARRAY]"));
    }
}
@@ -0,0 +1,28 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

public class SparseVectorFieldTypeTests extends FieldTypeTestCase {

    @Override
    protected MappedFieldType createDefaultFieldType() {
        return new SparseVectorFieldMapper.SparseVectorFieldType();
    }
}
@@ -0,0 +1,115 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.test.ESTestCase;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class VectorEncoderDecoderTests extends ESTestCase {

    public void testDenseVectorEncodingDecoding() {
        int dimCount = randomIntBetween(0, 300);
        float[] expectedValues = new float[dimCount];
        for (int i = 0; i < dimCount; i++) {
            expectedValues[i] = randomFloat();
        }

        // test that values that went through encoding and decoding are equal to their original
        BytesRef encodedDenseVector = mockEncodeDenseVector(expectedValues);
        float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(encodedDenseVector);
        assertArrayEquals(
            "Decoded dense vector values are not equal to their original.",
            expectedValues,
            decodedValues,
            0.001f
        );
    }

    public void testSparseVectorEncodingDecoding() {
        int dimCount = randomIntBetween(0, 100);
        float[] expectedValues = new float[dimCount];
        int[] expectedDims = randomUniqueDims(dimCount);
        for (int i = 0; i < dimCount; i++) {
            expectedValues[i] = randomFloat();
        }

        // test that sorting in the encoding works as expected
        int[] sortedDims = Arrays.copyOf(expectedDims, dimCount);
        Arrays.sort(sortedDims);
        VectorEncoderDecoder.sortSparseDimsValues(expectedDims, expectedValues, dimCount);
        assertArrayEquals(
            "Sparse vector dims are not properly sorted!",
            sortedDims,
            expectedDims
        );

        // test that values that went through encoding and decoding are equal to their original
        BytesRef encodedSparseVector = VectorEncoderDecoder.encodeSparseVector(expectedDims, expectedValues, dimCount);
        int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(encodedSparseVector);
        float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(encodedSparseVector);
        assertArrayEquals(
            "Decoded sparse vector dims are not equal to their original!",
            expectedDims,
            decodedDims
        );
        assertArrayEquals(
            "Decoded sparse vector values are not equal to their original.",
            expectedValues,
            decodedValues,
            0.001f
        );
    }

    // imitates the code in DenseVectorFieldMapper::parse
    private BytesRef mockEncodeDenseVector(float[] dims) {
        final short INT_BYTES = VectorEncoderDecoder.INT_BYTES;
        byte[] buf = new byte[INT_BYTES * dims.length];
        int offset = 0;
        int intValue;
        for (float value: dims) {
            intValue = Float.floatToIntBits(value);
            buf[offset] = (byte) (intValue >> 24);
            buf[offset+1] = (byte) (intValue >> 16);
            buf[offset+2] = (byte) (intValue >> 8);
            buf[offset+3] = (byte) intValue;
            offset += INT_BYTES;
        }
        return new BytesRef(buf, 0, offset);
    }

    // generates unique random dims
    private int[] randomUniqueDims(int dimCount) {
        int[] values = new int[dimCount];
        Set<Integer> usedValues = new HashSet<>();
        int value;
        for (int i = 0; i < dimCount; i++) {
            value = randomValueOtherThanMany(usedValues::contains, () -> randomIntBetween(0, SparseVectorFieldMapper.MAX_DIMS_NUMBER));
            usedValues.add(value);
            values[i] = value;
        }
        return values;
    }
}
@@ -0,0 +1,29 @@
setup:
  - skip:
      version: " - 6.99.99"
      reason: "dense_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            _doc:
              properties:
                my_dense_vector:
                  type: dense_vector

---
"Indexing":

  - do:
      index:
        index: test-index
        type: _doc
        id: 1
        body:
          my_dense_vector: [1.5, -10, 3455, 345452.4545]

  - match: { result: created }
@@ -0,0 +1,29 @@
setup:
  - skip:
      version: " - 6.99.99"
      reason: "sparse_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            _doc:
              properties:
                my_sparse_vector:
                  type: sparse_vector

---
"Indexing":

  - do:
      index:
        index: test-index
        type: _doc
        id: 1
        body:
          my_sparse_vector: { "50" : 1.8, "2" : -0.4, "10" : 1000.3, "4545" : -0.00004 }

  - match: { result: created }