Vector field (#33022)

1. Dense vector

PUT dindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "dense_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT dindex/_doc/1
{
  "my_text" : "text1",
  "my_vector" : [ 0.5, 10, 6 ]
}

2. Sparse vector

PUT sindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "sparse_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT sindex/_doc/1
{
  "my_text" : "text1",
  "my_vector" : {"1": 0.5, "99": -0.5,  "5": 1}
}
Mayya Sharipova, 2018-12-12 21:20:53 -05:00, committed by GitHub
parent 9c1cdea839
commit b5d532f9e3
14 changed files with 1135 additions and 0 deletions


@@ -47,6 +47,11 @@ string:: <<text,`text`>> and <<keyword,`keyword`>>
<<feature-vector>>:: Record numeric feature vectors to boost hits at query time.
<<dense-vector>>:: Record dense vectors of float values.
<<sparse-vector>>:: Record sparse vectors of float values.
[float]
=== Multi-fields
@@ -98,3 +103,7 @@ include::types/parent-join.asciidoc[]
include::types/feature.asciidoc[]
include::types/feature-vector.asciidoc[]
include::types/dense-vector.asciidoc[]
include::types/sparse-vector.asciidoc[]


@@ -0,0 +1,52 @@
[[dense-vector]]
=== Dense vector datatype
A `dense_vector` field stores dense vectors of float values. A vector
may contain at most 500 dimensions, and the number of dimensions can
differ across documents. A `dense_vector` field is single-valued.

These vectors can be used for document scoring. For example, a document
score can represent the distance between a given query vector and the
indexed document vector; a sketch of such a distance computation follows
the example below.
You index a dense vector as an array of floats.
[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "dense_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_text" : "text1",
  "my_vector" : [0.5, 10, 6]
}

PUT my_index/_doc/2
{
  "my_text" : "text2",
  "my_vector" : [-0.5, 10, 10, 4]
}
--------------------------------------------------
// CONSOLE
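
To make the scoring use case concrete, here is a minimal standalone
sketch of computing the Euclidean distance between a query vector and an
indexed document vector once both are available as float arrays. This
commit only adds storage for vectors; the class and method names below
are illustrative, not an API it introduces.

[source,java]
--------------------------------------------------
public final class VectorScoringSketch {

    /** Euclidean (L2) distance between two dense vectors of equal dimension. */
    static double euclideanDistance(float[] queryVector, float[] docVector) {
        if (queryVector.length != docVector.length) {
            throw new IllegalArgumentException("vectors must have the same number of dimensions");
        }
        double sumOfSquaredDiffs = 0;
        for (int dim = 0; dim < queryVector.length; dim++) {
            double diff = queryVector[dim] - docVector[dim];
            sumOfSquaredDiffs += diff * diff;
        }
        return Math.sqrt(sumOfSquaredDiffs);
    }

    public static void main(String[] args) {
        float[] docVector = {0.5f, 10f, 6f};      // the document vector indexed above
        float[] queryVector = {0.4f, 9.5f, 6.2f}; // a hypothetical query vector
        // a smaller distance could translate into a higher score
        System.out.println(euclideanDistance(queryVector, docVector)); // ~0.5477
    }
}
--------------------------------------------------
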
Internally, each document's dense vector is encoded as a binary
doc value. Its size in bytes is `4 * NUMBER_OF_DIMENSIONS`, where
`NUMBER_OF_DIMENSIONS` is the number of dimensions in the vector;
a 3-dimensional vector, for example, takes 12 bytes.


@@ -0,0 +1,55 @@
[[sparse-vector]]
=== Sparse vector datatype
A `sparse_vector` field stores sparse vectors of float values. A vector
may contain at most 500 dimensions, and the number of dimensions can
differ across documents. A `sparse_vector` field is single-valued.

These vectors can be used for document scoring. For example, a document
score can represent the distance between a given query vector and the
indexed document vector.

You represent a sparse vector as an object whose fields are dimensions
and whose values are the values for those dimensions. Dimensions are
integers from `0` to `65535`, encoded as strings, and they don't need to
be listed in any particular order.
[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "sparse_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_text" : "text1",
  "my_vector" : {"1": 0.5, "5": -0.5, "100": 1}
}

PUT my_index/_doc/2
{
  "my_text" : "text2",
  "my_vector" : {"103": 0.5, "4": -0.5, "5": 1, "11" : 1.2}
}
--------------------------------------------------
// CONSOLE
Internally, each document's sparse vector is encoded as a binary
doc value. Its size in bytes is `6 * NUMBER_OF_DIMENSIONS`, where
`NUMBER_OF_DIMENSIONS` is the number of dimensions in the vector:
dimensions are sorted in ascending order and stored as 2-byte integers,
followed by their values stored as 4-byte floats.
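
For illustration, here is a condensed standalone sketch of this layout,
mirroring the encoding logic of the `VectorEncoderDecoder` class added
in this commit. It assumes `dims` is already sorted in ascending order,
as the real encoder guarantees; the class name is illustrative only.

[source,java]
--------------------------------------------------
public final class SparseVectorLayoutSketch {

    // 2-byte big-endian dimensions first, then 4-byte big-endian float
    // bits for the corresponding values: 6 bytes per dimension in total.
    static byte[] encodeSparseVector(int[] dims, float[] values, int dimCount) {
        byte[] buf = new byte[dimCount * 6];
        int offset = 0;
        for (int i = 0; i < dimCount; i++) {
            buf[offset++] = (byte) (dims[i] >> 8);
            buf[offset++] = (byte) dims[i];
        }
        for (int i = 0; i < dimCount; i++) {
            int bits = Float.floatToIntBits(values[i]);
            buf[offset++] = (byte) (bits >> 24);
            buf[offset++] = (byte) (bits >> 16);
            buf[offset++] = (byte) (bits >> 8);
            buf[offset++] = (byte) bits;
        }
        return buf;
    }

    public static void main(String[] args) {
        // dims sorted ascending; here: {"1": 0.5, "5": -0.5, "100": 1}
        byte[] encoded = encodeSparseVector(new int[]{1, 5, 100}, new float[]{0.5f, -0.5f, 1f}, 3);
        System.out.println(encoded.length); // 18 bytes = 6 * 3 dimensions
    }
}
--------------------------------------------------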


@@ -0,0 +1,195 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.joda.time.DateTimeZone;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
/**
* A {@link FieldMapper} for indexing a dense vector of floats.
*/
public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser {
public static final String CONTENT_TYPE = "dense_vector";
public static final short MAX_DIMS_COUNT = 500; // maximum allowed number of dimensions
private static final byte INT_BYTES = 4;
public static class Defaults {
public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType();
static {
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
FIELD_TYPE.setHasDocValues(true);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.freeze();
}
}
public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> {
public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
builder = this;
}
@Override
public DenseVectorFieldType fieldType() {
return (DenseVectorFieldType) super.fieldType();
}
@Override
public DenseVectorFieldMapper build(BuilderContext context) {
setupFieldType(context);
return new DenseVectorFieldMapper(
name, fieldType, defaultFieldType,
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
}
}
public static class TypeParser implements Mapper.TypeParser {
@Override
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name);
return builder;
}
}
public static final class DenseVectorFieldType extends MappedFieldType {
public DenseVectorFieldType() {}
protected DenseVectorFieldType(DenseVectorFieldType ref) {
super(ref);
}
public DenseVectorFieldType clone() {
return new DenseVectorFieldType(this);
}
@Override
public String typeName() {
return CONTENT_TYPE;
}
@Override
public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
}
@Override
public Query existsQuery(QueryShardContext context) {
return new DocValuesFieldExistsQuery(name());
}
@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
}
@Override
public Query termQuery(Object value, QueryShardContext context) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
}
}
private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
assert fieldType.indexOptions() == IndexOptions.NONE;
}
@Override
protected DenseVectorFieldMapper clone() {
return (DenseVectorFieldMapper) super.clone();
}
@Override
public DenseVectorFieldType fieldType() {
return (DenseVectorFieldType) super.fieldType();
}
@Override
public void parse(ParseContext context) throws IOException {
if (context.externalValueSet()) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
}
// encode array of floats as array of integers and store into buf
// this code lives here rather than in VectorEncoderDecoder so as not to create extra arrays
byte[] buf = new byte[0];
int offset = 0;
int dim = 0;
for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) {
ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation);
float value = context.parser().floatValue(true);
if (buf.length < (offset + INT_BYTES)) {
buf = ArrayUtil.grow(buf, (offset + INT_BYTES));
}
int intValue = Float.floatToIntBits(value);
buf[offset] = (byte) (intValue >> 24);
buf[offset+1] = (byte) (intValue >> 16);
buf[offset+2] = (byte) (intValue >> 8);
buf[offset+3] = (byte) intValue;
offset += INT_BYTES;
dim++;
if (dim > MAX_DIMS_COUNT) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
"] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
}
}
BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset));
if (context.doc().getByKey(fieldType().name()) != null) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
"] doesn't not support indexing multiple values for the same field in the same document");
}
context.doc().addWithKey(fieldType().name(), field);
}
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
throw new AssertionError("parse is implemented directly");
}
@Override
protected String contentType() {
return CONTENT_TYPE;
}
}


@@ -39,6 +39,8 @@ public class MapperExtrasPlugin extends Plugin implements MapperPlugin, SearchPl
mappers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser());
mappers.put(FeatureFieldMapper.CONTENT_TYPE, new FeatureFieldMapper.TypeParser());
mappers.put(FeatureVectorFieldMapper.CONTENT_TYPE, new FeatureVectorFieldMapper.TypeParser());
mappers.put(DenseVectorFieldMapper.CONTENT_TYPE, new DenseVectorFieldMapper.TypeParser());
mappers.put(SparseVectorFieldMapper.CONTENT_TYPE, new SparseVectorFieldMapper.TypeParser());
return Collections.unmodifiableMap(mappers);
}


@@ -0,0 +1,207 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.joda.time.DateTimeZone;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
/**
* A {@link FieldMapper} for indexing a sparse vector of floats.
*/
public class SparseVectorFieldMapper extends FieldMapper {
public static final String CONTENT_TYPE = "sparse_vector";
public static final short MAX_DIMS_COUNT = 500; // maximum allowed number of dimensions
public static final int MAX_DIMS_NUMBER = 65535; // maximum allowed dimension number
public static class Defaults {
public static final MappedFieldType FIELD_TYPE = new SparseVectorFieldType();
static {
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
FIELD_TYPE.setHasDocValues(true);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.freeze();
}
}
public static class Builder extends FieldMapper.Builder<Builder, SparseVectorFieldMapper> {
public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
builder = this;
}
@Override
public SparseVectorFieldType fieldType() {
return (SparseVectorFieldType) super.fieldType();
}
@Override
public SparseVectorFieldMapper build(BuilderContext context) {
setupFieldType(context);
return new SparseVectorFieldMapper(
name, fieldType, defaultFieldType,
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
}
}
public static class TypeParser implements Mapper.TypeParser {
@Override
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
SparseVectorFieldMapper.Builder builder = new SparseVectorFieldMapper.Builder(name);
return builder;
}
}
public static final class SparseVectorFieldType extends MappedFieldType {
public SparseVectorFieldType() {}
protected SparseVectorFieldType(SparseVectorFieldType ref) {
super(ref);
}
public SparseVectorFieldType clone() {
return new SparseVectorFieldType(this);
}
@Override
public String typeName() {
return CONTENT_TYPE;
}
@Override
public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
}
@Override
public Query existsQuery(QueryShardContext context) {
return new DocValuesFieldExistsQuery(name());
}
@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
}
@Override
public Query termQuery(Object value, QueryShardContext context) {
throw new UnsupportedOperationException(
"Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
}
}
private SparseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
assert fieldType.indexOptions() == IndexOptions.NONE;
}
@Override
protected SparseVectorFieldMapper clone() {
return (SparseVectorFieldMapper) super.clone();
}
@Override
public SparseVectorFieldType fieldType() {
return (SparseVectorFieldType) super.fieldType();
}
@Override
public void parse(ParseContext context) throws IOException {
if (context.externalValueSet()) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
}
ensureExpectedToken(Token.START_OBJECT, context.parser().currentToken(), context.parser()::getTokenLocation);
int[] dims = new int[0];
float[] values = new float[0];
int dimCount = 0;
int dim = 0;
float value;
for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
if (token == Token.FIELD_NAME) {
try {
dim = Integer.parseInt(context.parser().currentName());
if (dim < 0 || dim > MAX_DIMS_NUMBER) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimension number " +
"must be a non-negative integer value not exceeding [" + MAX_DIMS_NUMBER + "], got [" + dim + "]");
}
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimensions should be " +
"integers represented as strings, but got [" + context.parser().currentName() + "]", e);
}
} else if (token == Token.VALUE_NUMBER) {
value = context.parser().floatValue(true);
if (dims.length <= dimCount) { // ensure arrays have enough capacity
values = ArrayUtil.grow(values, dimCount + 1);
dims = ArrayUtil.grow(dims, dimCount + 1);
}
dims[dimCount] = dim;
values[dimCount] = value;
dimCount++;
if (dimCount > MAX_DIMS_COUNT) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
"] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
}
} else {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
"] takes an object that maps a dimension number to a float, " + "but got unexpected token [" + token + "]");
}
}
BytesRef br = VectorEncoderDecoder.encodeSparseVector(dims, values, dimCount);
BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), br);
context.doc().addWithKey(fieldType().name(), field);
}
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
throw new AssertionError("parse is implemented directly");
}
@Override
protected String contentType() {
return CONTENT_TYPE;
}
}


@@ -0,0 +1,141 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
// static utility functions for encoding and decoding dense_vector and sparse_vector fields
final class VectorEncoderDecoder {
static final byte INT_BYTES = 4;
static final byte SHORT_BYTES = 2;
private VectorEncoderDecoder() { }
/**
* Encodes a sparse vector represented by dims, values and dimCount into a BytesRef:
* first the dimensions, sorted in ascending order and encoded as 2-byte integers,
* followed by the float values, encoded as 4-byte integers in the same order
* @param dims - dimensions of the sparse vector
* @param values - values of the sparse vector
* @param dimCount - number of the vector's dimensions
* @return BytesRef
*/
static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
// 1. Sort dims and values
sortSparseDimsValues(dims, values, dimCount);
byte[] buf = new byte[dimCount * (INT_BYTES + SHORT_BYTES)];
// 2. Encode dimensions
// as each dimension is a positive value that doesn't exceed 65535, 2 bytes is enough for encoding it
int offset = 0;
for (int dim = 0; dim < dimCount; dim++) {
buf[offset] = (byte) (dims[dim] >> 8);
buf[offset+1] = (byte) dims[dim];
offset += SHORT_BYTES;
}
// 3. Encode values
for (int dim = 0; dim < dimCount; dim++) {
int intValue = Float.floatToIntBits(values[dim]);
buf[offset] = (byte) (intValue >> 24);
buf[offset+1] = (byte) (intValue >> 16);
buf[offset+2] = (byte) (intValue >> 8);
buf[offset+3] = (byte) intValue;
offset += INT_BYTES;
}
return new BytesRef(buf);
}
/**
* Decodes the first part of BytesRef into sparse vector dimensions
* @param vectorBR - encoded vector as a BytesRef
*/
static int[] decodeSparseVectorDims(BytesRef vectorBR) {
int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
int[] dims = new int[dimCount];
int offset = vectorBR.offset;
for (int dim = 0; dim < dimCount; dim++) {
dims[dim] = ((vectorBR.bytes[offset] & 0xFF) << 8) | (vectorBR.bytes[offset+1] & 0xFF);
offset += SHORT_BYTES;
}
return dims;
}
/**
* Decodes the second part of the BytesRef into sparse vector values
* @param vectorBR - encoded vector as a BytesRef
*/
static float[] decodeSparseVector(BytesRef vectorBR) {
int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
int offset = vectorBR.offset + SHORT_BYTES * dimCount; //calculate the offset from where values are encoded
float[] vector = new float[dimCount];
for (int dim = 0; dim < dimCount; dim++) {
int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) |
((vectorBR.bytes[offset+1] & 0xFF) << 16) |
((vectorBR.bytes[offset+2] & 0xFF) << 8) |
(vectorBR.bytes[offset+3] & 0xFF);
vector[dim] = Float.intBitsToFloat(intValue);
offset = offset + INT_BYTES;
}
return vector;
}
/**
* Sorts dimensions in ascending order and
* sorts values in the same order as their corresponding dimensions
*/
static void sortSparseDimsValues(int[] dims, float[] values, int n) {
new InPlaceMergeSorter() {
@Override
public int compare(int i, int j) {
return Integer.compare(dims[i], dims[j]);
}
@Override
public void swap(int i, int j) {
int tempDim = dims[i];
dims[i] = dims[j];
dims[j] = tempDim;
float tempValue = values[j];
values[j] = values[i];
values[i] = tempValue;
}
}.sort(0, n);
}
// Decodes a BytesRef into an array of floats
static float[] decodeDenseVector(BytesRef vectorBR) {
int dimCount = vectorBR.length / INT_BYTES;
float[] vector = new float[dimCount];
int offset = vectorBR.offset;
for (int dim = 0; dim < dimCount; dim++) {
int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) |
((vectorBR.bytes[offset+1] & 0xFF) << 16) |
((vectorBR.bytes[offset+2] & 0xFF) << 8) |
(vectorBR.bytes[offset+3] & 0xFF);
vector[dim] = Float.intBitsToFloat(intValue);
offset = offset + INT_BYTES;
}
return vector;
}
}


@@ -0,0 +1,81 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.hamcrest.Matchers;
import java.util.Collection;
public class DenseVectorFieldMapperTests extends ESSingleNodeTestCase {
@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
return pluginList(MapperExtrasPlugin.class);
}
public void testDefaults() throws Exception {
IndexService indexService = createIndex("test-index");
DocumentMapperParser parser = indexService.mapperService().documentMapperParser();
String mapping = Strings.toString(XContentFactory.jsonBuilder()
.startObject()
.startObject("_doc")
.startObject("properties")
.startObject("my-dense-vector").field("type", "dense_vector")
.endObject()
.endObject()
.endObject()
.endObject());
DocumentMapper mapper = parser.parse("_doc", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
float[] expectedArray = {-12.1f, 100.7f, -4};
ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
.startArray("my-dense-vector").value(expectedArray[0]).value(expectedArray[1]).value(expectedArray[2]).endArray()
.endObject()),
XContentType.JSON));
IndexableField[] fields = doc1.rootDoc().getFields("my-dense-vector");
assertEquals(1, fields.length);
assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class));
// assert that after decoding the indexed value is equal to expected
BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue();
float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(vectorBR);
assertArrayEquals(
"Decoded dense vector values is not equal to the indexed one.",
expectedArray,
decodedValues,
0.001f
);
}
}


@@ -0,0 +1,28 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
public class DenseVectorFieldTypeTests extends FieldTypeTestCase {
@Override
protected MappedFieldType createDefaultFieldType() {
return new DenseVectorFieldMapper.DenseVectorFieldType();
}
}


@@ -0,0 +1,164 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import java.util.Collection;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.core.IsInstanceOf.instanceOf;
public class SparseVectorFieldMapperTests extends ESSingleNodeTestCase {
private DocumentMapper mapper;
@Before
public void setup() throws Exception {
IndexService indexService = createIndex("test-index");
DocumentMapperParser parser = indexService.mapperService().documentMapperParser();
String mapping = Strings.toString(XContentFactory.jsonBuilder()
.startObject()
.startObject("_doc")
.startObject("properties")
.startObject("my-sparse-vector").field("type", "sparse_vector")
.endObject()
.endObject()
.endObject()
.endObject());
mapper = parser.parse("_doc", new CompressedXContent(mapping));
}
@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
return pluginList(MapperExtrasPlugin.class);
}
public void testDefaults() throws Exception {
int[] indexedDims = {65535, 50, 2};
float[] indexedValues = {0.5f, 1800f, -34567.11f};
ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
.startObject("my-sparse-vector")
.field(Integer.toString(indexedDims[0]), indexedValues[0])
.field(Integer.toString(indexedDims[1]), indexedValues[1])
.field(Integer.toString(indexedDims[2]), indexedValues[2])
.endObject()
.endObject()),
XContentType.JSON));
IndexableField[] fields = doc1.rootDoc().getFields("my-sparse-vector");
assertEquals(1, fields.length);
assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class));
int[] expectedDims = {2, 50, 65535}; //the same as indexed but sorted
float[] expectedValues = {-34567.11f, 1800f, 0.5f}; //the same as indexed but sorted by their dimensions
// assert that after decoding the indexed dims and values are equal to expected
BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue();
int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(vectorBR);
assertArrayEquals(
"Decoded sparse vector dimensions are not equal to the indexed ones.",
expectedDims,
decodedDims
);
float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(vectorBR);
assertArrayEquals(
"Decoded sparse vector values are not equal to the indexed ones.",
expectedValues,
decodedValues,
0.001f
);
}
public void testErrors() {
// 1. test for an error on negative dimension
MapperParsingException e = expectThrows(MapperParsingException.class, () -> {
mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
.startObject("my-sparse-vector")
.field(Integer.toString(-50), 100f)
.endObject()
.endObject()),
XContentType.JSON));
});
assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
assertThat(e.getCause().getMessage(), containsString(
"dimension number must be a non-negative integer value not exceeding [65535], got [-50]"));
// 2. test for an error on a dimension greater than MAX_DIMS_NUMBER
e = expectThrows(MapperParsingException.class, () -> {
mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
.startObject("my-sparse-vector")
.field(Integer.toString(70000), 100f)
.endObject()
.endObject()),
XContentType.JSON));
});
assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
assertThat(e.getCause().getMessage(), containsString(
"dimension number must be a non-negative integer value not exceeding [65535], got [70000]"));
// 3. test for an error on a wrong formatted dimension
e = expectThrows(MapperParsingException.class, () -> {
mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
.startObject("my-sparse-vector")
.field("WrongDim123", 100f)
.endObject()
.endObject()),
XContentType.JSON));
});
assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
assertThat(e.getCause().getMessage(), containsString(
"dimensions should be integers represented as strings, but got [WrongDim123]"));
// 4. test for an error on a wrong format for the map of dims to values
e = expectThrows(MapperParsingException.class, () -> {
mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
.startObject("my-sparse-vector")
.startArray(Integer.toString(10)).value(10f).value(100f).endArray()
.endObject()
.endObject()),
XContentType.JSON));
});
assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
assertThat(e.getCause().getMessage(), containsString(
"takes an object that maps a dimension number to a float, but got unexpected token [START_ARRAY]"));
}
}


@@ -0,0 +1,28 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
public class SparseVectorFieldTypeTests extends FieldTypeTestCase {
@Override
protected MappedFieldType createDefaultFieldType() {
return new SparseVectorFieldMapper.SparseVectorFieldType();
}
}


@@ -0,0 +1,115 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.test.ESTestCase;
import java.util.HashSet;
import java.util.Set;
import java.util.Arrays;
public class VectorEncoderDecoderTests extends ESTestCase {
public void testDenseVectorEncodingDecoding() {
int dimCount = randomIntBetween(0, 300);
float[] expectedValues = new float[dimCount];
for (int i = 0; i < dimCount; i++) {
expectedValues[i] = randomFloat();
}
// test that values that went through encoding and decoding are equal to their original
BytesRef encodedDenseVector = mockEncodeDenseVector(expectedValues);
float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(encodedDenseVector);
assertArrayEquals(
"Decoded dense vector values are not equal to their original.",
expectedValues,
decodedValues,
0.001f
);
}
public void testSparseVectorEncodingDecoding() {
int dimCount = randomIntBetween(0, 100);
float[] expectedValues = new float[dimCount];
int[] expectedDims = randomUniqueDims(dimCount);
for (int i = 0; i < dimCount; i++) {
expectedValues[i] = randomFloat();
}
// test that sorting in the encoding works as expected
int[] sortedDims = Arrays.copyOf(expectedDims, dimCount);
Arrays.sort(sortedDims);
VectorEncoderDecoder.sortSparseDimsValues(expectedDims, expectedValues, dimCount);
assertArrayEquals(
"Sparse vector dims are not properly sorted!",
sortedDims,
expectedDims
);
// test that values that went through encoding and decoding are equal to their original
BytesRef encodedSparseVector = VectorEncoderDecoder.encodeSparseVector(expectedDims, expectedValues, dimCount);
int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(encodedSparseVector);
float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(encodedSparseVector);
assertArrayEquals(
"Decoded sparse vector dims are not equal to their original!",
expectedDims,
decodedDims
);
assertArrayEquals(
"Decoded sparse vector values are not equal to their original.",
expectedValues,
decodedValues,
0.001f
);
}
// imitates the code in DenseVectorFieldMapper::parse
private BytesRef mockEncodeDenseVector(float[] values) {
final short INT_BYTES = VectorEncoderDecoder.INT_BYTES;
byte[] buf = new byte[INT_BYTES * values.length];
int offset = 0;
int intValue;
for (float value: values) {
intValue = Float.floatToIntBits(value);
buf[offset] = (byte) (intValue >> 24);
buf[offset+1] = (byte) (intValue >> 16);
buf[offset+2] = (byte) (intValue >> 8);
buf[offset+3] = (byte) intValue;
offset += INT_BYTES;
}
return new BytesRef(buf, 0, offset);
}
// generate unique random dims
private int[] randomUniqueDims(int dimCount) {
int[] values = new int[dimCount];
Set<Integer> usedValues = new HashSet<>();
int value;
for (int i = 0; i < dimCount; i++) {
value = randomValueOtherThanMany(usedValues::contains, () -> randomIntBetween(0, SparseVectorFieldMapper.MAX_DIMS_NUMBER));
usedValues.add(value);
values[i] = value;
}
return values;
}
}


@@ -0,0 +1,29 @@
setup:
  - skip:
      version: " - 6.99.99"
      reason: "dense_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            _doc:
              properties:
                my_dense_vector:
                  type: dense_vector

---
"Indexing":

  - do:
      index:
        index: test-index
        type: _doc
        id: 1
        body:
          my_dense_vector: [1.5, -10, 3455, 345452.4545]

  - match: { result: created }


@@ -0,0 +1,29 @@
setup:
  - skip:
      version: " - 6.99.99"
      reason: "sparse_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            _doc:
              properties:
                my_sparse_vector:
                  type: sparse_vector

---
"Indexing":

  - do:
      index:
        index: test-index
        type: _doc
        id: 1
        body:
          my_sparse_vector: { "50" : 1.8, "2" : -0.4, "10" : 1000.3, "4545" : -0.00004}

  - match: { result: created }