Add scaled_float. #19264

This is an attempt to revive #15939, motivated by elastic/beats#1941.
Half-floats are a poor option for storing percentages: they would likely
require 2 bytes all the time, while percentages need no more than one byte.

So this PR exposes a new `scaled_float` type that requires a `scaling_factor`
and internally indexes `value*scaling_factor` in a long field. Compared to the
original PR it exposes a lower-level API so that the trade-offs are clearer and
avoids any reference to fixed precision that might imply that this type is more
accurate (actually it is *less* accurate).

In addition to being more space-efficient for some use-cases that beats is
interested in, this is also faster than `half_float` unless we can improve the
efficiency of decoding half-float bits (which is currently done in software)
or until Java gets first-class support for half-floats.
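
For reference, a minimal sketch of the round-trip the new field performs
(hypothetical helper methods for illustration, not part of this commit;
`scalingFactor` is the mapping's `scaling_factor` parameter):

    // Encode at index time: multiply by the scaling factor, round to the closest long.
    static long encode(double value, double scalingFactor) {
        return Math.round(value * scalingFactor);
    }

    // Decode at search time: divide the stored long by the scaling factor.
    static double decode(long scaled, double scalingFactor) {
        return scaled / scalingFactor;
    }

    // With a scaling_factor of 100, 0.123 is indexed as the long 12
    // and decodes to 0.12 — accuracy is traded for compressibility.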
Adrien Grand 2016-07-05 11:08:45 +02:00
parent 59cda81b40
commit 398d70b567
7 changed files with 1214 additions and 24 deletions


@@ -69,7 +69,7 @@ import java.util.Objects;
public class NumberFieldMapper extends FieldMapper implements AllFieldMapper.IncludeInAll {
// this is private since it has a different default
-    private static final Setting<Boolean> COERCE_SETTING =
+    static final Setting<Boolean> COERCE_SETTING =
Setting.boolSetting("index.mapping.coerce", true, Property.IndexScope);
public static class Builder extends FieldMapper.Builder<Builder, NumberFieldMapper> {


@@ -0,0 +1,616 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper.core;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.elasticsearch.action.fieldstats.FieldStats;
import org.elasticsearch.common.Explicit;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.fielddata.AtomicNumericFieldData;
import org.elasticsearch.index.fielddata.FieldData;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;
import org.elasticsearch.index.fielddata.NumericDoubleValues;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.index.fielddata.fieldcomparator.DoubleValuesComparatorSource;
import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.core.LegacyNumberFieldMapper.Defaults;
import org.elasticsearch.index.mapper.internal.AllFieldMapper;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.MultiValueMode;
import org.joda.time.DateTimeZone;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/** A {@link FieldMapper} for scaled floats. Values are internally multiplied
* by a scaling factor and rounded to the closest long. */
public class ScaledFloatFieldMapper extends FieldMapper implements AllFieldMapper.IncludeInAll {
public static final String CONTENT_TYPE = "scaled_float";
// use the same default as numbers
private static final Setting<Boolean> COERCE_SETTING = NumberFieldMapper.COERCE_SETTING;
public static class Builder extends FieldMapper.Builder<Builder, ScaledFloatFieldMapper> {
private boolean scalingFactorSet = false;
private Boolean ignoreMalformed;
private Boolean coerce;
public Builder(String name) {
super(name, new ScaledFloatFieldType(), new ScaledFloatFieldType());
builder = this;
}
public Builder ignoreMalformed(boolean ignoreMalformed) {
this.ignoreMalformed = ignoreMalformed;
return builder;
}
protected Explicit<Boolean> ignoreMalformed(BuilderContext context) {
if (ignoreMalformed != null) {
return new Explicit<>(ignoreMalformed, true);
}
if (context.indexSettings() != null) {
return new Explicit<>(IGNORE_MALFORMED_SETTING.get(context.indexSettings()), false);
}
return Defaults.IGNORE_MALFORMED;
}
public Builder coerce(boolean coerce) {
this.coerce = coerce;
return builder;
}
public Builder scalingFactor(double scalingFactor) {
((ScaledFloatFieldType) fieldType).setScalingFactor(scalingFactor);
scalingFactorSet = true;
return this;
}
protected Explicit<Boolean> coerce(BuilderContext context) {
if (coerce != null) {
return new Explicit<>(coerce, true);
}
if (context.indexSettings() != null) {
return new Explicit<>(COERCE_SETTING.get(context.indexSettings()), false);
}
return Defaults.COERCE;
}
@Override
public ScaledFloatFieldMapper build(BuilderContext context) {
if (scalingFactorSet == false) {
throw new IllegalArgumentException("Field [" + name + "] misses required parameter [scaling_factor]");
}
setupFieldType(context);
ScaledFloatFieldMapper fieldMapper =
new ScaledFloatFieldMapper(name, fieldType, defaultFieldType, ignoreMalformed(context),
coerce(context), context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
return (ScaledFloatFieldMapper) fieldMapper.includeInAll(includeInAll);
}
}
public static class TypeParser implements Mapper.TypeParser {
@Override
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node,
ParserContext parserContext) throws MapperParsingException {
Builder builder = new Builder(name);
TypeParsers.parseField(builder, name, node, parserContext);
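// scaled_float-specific settings are consumed here and removed from the node map;
// entries left over afterwards are rejected later by the generic mapping parser.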
for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
Map.Entry<String, Object> entry = iterator.next();
String propName = entry.getKey();
Object propNode = entry.getValue();
if (propName.equals("null_value")) {
if (propNode == null) {
throw new MapperParsingException("Property [null_value] cannot be null.");
}
builder.nullValue(NumberFieldMapper.NumberType.DOUBLE.parse(propNode));
iterator.remove();
} else if (propName.equals("ignore_malformed")) {
builder.ignoreMalformed(TypeParsers.nodeBooleanValue("ignore_malformed", propNode, parserContext));
iterator.remove();
} else if (propName.equals("coerce")) {
builder.coerce(TypeParsers.nodeBooleanValue("coerce", propNode, parserContext));
iterator.remove();
} else if (propName.equals("scaling_factor")) {
builder.scalingFactor(NumberFieldMapper.NumberType.DOUBLE.parse(propNode).doubleValue());
iterator.remove();
}
}
return builder;
}
}
public static final class ScaledFloatFieldType extends MappedFieldType {
private double scalingFactor;
public ScaledFloatFieldType() {
super();
setTokenized(false);
setHasDocValues(true);
setOmitNorms(true);
}
ScaledFloatFieldType(ScaledFloatFieldType other) {
super(other);
this.scalingFactor = other.scalingFactor;
}
public double getScalingFactor() {
return scalingFactor;
}
public void setScalingFactor(double scalingFactor) {
checkIfFrozen();
this.scalingFactor = scalingFactor;
}
@Override
public MappedFieldType clone() {
return new ScaledFloatFieldType(this);
}
@Override
public String typeName() {
return CONTENT_TYPE;
}
@Override
public void checkCompatibility(MappedFieldType other, List<String> conflicts, boolean strict) {
super.checkCompatibility(other, conflicts, strict);
if (scalingFactor != ((ScaledFloatFieldType) other).getScalingFactor()) {
conflicts.add("mapper [" + name() + "] has different [scaling_factor] values");
}
}
@Override
public Query termQuery(Object value, QueryShardContext context) {
failIfNotIndexed();
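// Scale the query value the same way index time does (multiply and round),
// then delegate to an exact query on the underlying long field.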
double queryValue = NumberFieldMapper.NumberType.DOUBLE.parse(value).doubleValue();
long scaledValue = Math.round(queryValue * scalingFactor);
Query query = NumberFieldMapper.NumberType.LONG.termQuery(name(), scaledValue);
if (boost() != 1f) {
query = new BoostQuery(query, boost());
}
return query;
}
@Override
public Query termsQuery(List values, QueryShardContext context) {
failIfNotIndexed();
List<Long> scaledValues = new ArrayList<>(values.size());
for (Object value : values) {
double queryValue = NumberFieldMapper.NumberType.DOUBLE.parse(value).doubleValue();
long scaledValue = Math.round(queryValue * scalingFactor);
scaledValues.add(scaledValue);
}
Query query = NumberFieldMapper.NumberType.LONG.termsQuery(name(), Collections.unmodifiableList(scaledValues));
if (boost() != 1f) {
query = new BoostQuery(query, boost());
}
return query;
}
@Override
public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper) {
failIfNotIndexed();
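// Translate the double bounds into the scaled long domain: exclusive bounds are
// first nudged to the next representable double, then the lower bound is rounded
// up and the upper bound down so the long range matches the requested double range.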
Long lo = null;
if (lowerTerm != null) {
double dValue = NumberFieldMapper.NumberType.DOUBLE.parse(lowerTerm).doubleValue();
if (includeLower == false) {
dValue = Math.nextUp(dValue);
}
lo = Math.round(Math.ceil(dValue * scalingFactor));
}
Long hi = null;
if (upperTerm != null) {
double dValue = NumberFieldMapper.NumberType.DOUBLE.parse(upperTerm).doubleValue();
if (includeUpper == false) {
dValue = Math.nextDown(dValue);
}
hi = Math.round(Math.floor(dValue * scalingFactor));
}
Query query = NumberFieldMapper.NumberType.LONG.rangeQuery(name(), lo, hi, true, true);
if (boost() != 1f) {
query = new BoostQuery(query, boost());
}
return query;
}
@Override
public FieldStats<?> stats(IndexReader reader) throws IOException {
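// Gather stats on the underlying long values, then scale min/max back to doubles.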
FieldStats.Long stats = (FieldStats.Long) NumberFieldMapper.NumberType.LONG.stats(
reader, name(), isSearchable(), isAggregatable());
if (stats == null) {
return null;
}
return new FieldStats.Double(stats.getMaxDoc(), stats.getDocCount(),
stats.getSumDocFreq(), stats.getSumTotalTermFreq(),
stats.isSearchable(), stats.isAggregatable(),
stats.getMinValue() == null ? null : stats.getMinValue() / scalingFactor,
stats.getMaxValue() == null ? null : stats.getMaxValue() / scalingFactor);
}
@Override
public IndexFieldData.Builder fielddataBuilder() {
failIfNoDocValues();
return new IndexFieldData.Builder() {
@Override
public IndexFieldData<?> build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache,
CircuitBreakerService breakerService, MapperService mapperService) {
final IndexNumericFieldData scaledValues = (IndexNumericFieldData) new DocValuesIndexFieldData.Builder()
.numericType(IndexNumericFieldData.NumericType.LONG)
.build(indexSettings, fieldType, cache, breakerService, mapperService);
return new ScaledFloatIndexFieldData(scaledValues, scalingFactor);
}
};
}
@Override
public Object valueForSearch(Object value) {
if (value == null) {
return null;
}
return ((Number) value).longValue() / scalingFactor;
}
@Override
public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
if (timeZone != null) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName()
+ "] does not support custom time zones");
}
if (format == null) {
return DocValueFormat.RAW;
} else {
return new DocValueFormat.Decimal(format);
}
}
@Override
public boolean equals(Object o) {
if (super.equals(o) == false) {
return false;
}
return scalingFactor == ((ScaledFloatFieldType) o).scalingFactor;
}
@Override
public int hashCode() {
return 31 * super.hashCode() + Double.hashCode(scalingFactor);
}
}
private Boolean includeInAll;
private Explicit<Boolean> ignoreMalformed;
private Explicit<Boolean> coerce;
private ScaledFloatFieldMapper(
String simpleName,
MappedFieldType fieldType,
MappedFieldType defaultFieldType,
Explicit<Boolean> ignoreMalformed,
Explicit<Boolean> coerce,
Settings indexSettings,
MultiFields multiFields,
CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
final double scalingFactor = fieldType().getScalingFactor();
if (Double.isFinite(scalingFactor) == false || scalingFactor <= 0) {
throw new IllegalArgumentException("[scaling_factor] must be a positive number, got [" + scalingFactor + "]");
}
this.ignoreMalformed = ignoreMalformed;
this.coerce = coerce;
}
@Override
public ScaledFloatFieldType fieldType() {
return (ScaledFloatFieldType) super.fieldType();
}
@Override
protected String contentType() {
return fieldType.typeName();
}
@Override
protected ScaledFloatFieldMapper clone() {
return (ScaledFloatFieldMapper) super.clone();
}
@Override
public Mapper includeInAll(Boolean includeInAll) {
if (includeInAll != null) {
ScaledFloatFieldMapper clone = clone();
clone.includeInAll = includeInAll;
return clone;
} else {
return this;
}
}
@Override
public Mapper includeInAllIfNotSet(Boolean includeInAll) {
if (includeInAll != null && this.includeInAll == null) {
ScaledFloatFieldMapper clone = clone();
clone.includeInAll = includeInAll;
return clone;
} else {
return this;
}
}
@Override
public Mapper unsetIncludeInAll() {
if (includeInAll != null) {
ScaledFloatFieldMapper clone = clone();
clone.includeInAll = null;
return clone;
} else {
return this;
}
}
@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
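// Parse the incoming value, honoring coerce (empty strings as null) and
// ignore_malformed, then index Math.round(value * scaling_factor) as a long.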
XContentParser parser = context.parser();
Object value;
Number numericValue = null;
if (context.externalValueSet()) {
value = context.externalValue();
} else if (parser.currentToken() == Token.VALUE_NULL) {
value = null;
} else if (coerce.value()
&& parser.currentToken() == Token.VALUE_STRING
&& parser.textLength() == 0) {
value = null;
} else {
value = parser.textOrNull();
if (value != null) {
try {
numericValue = NumberFieldMapper.NumberType.DOUBLE.parse(parser, coerce.value());
} catch (IllegalArgumentException e) {
if (ignoreMalformed.value()) {
return;
} else {
throw e;
}
}
}
}
if (value == null) {
value = fieldType().nullValue();
}
if (value == null) {
return;
}
if (numericValue == null) {
numericValue = NumberFieldMapper.NumberType.DOUBLE.parse(value);
}
if (context.includeInAll(includeInAll, this)) {
context.allEntries().addText(fieldType().name(), value.toString(), fieldType().boost());
}
double doubleValue = numericValue.doubleValue();
if (Double.isFinite(doubleValue) == false) {
// since we encode to a long, we have no way to carry NaNs and infinities
throw new IllegalArgumentException("[scaled_float] only supports finite values, but got [" + doubleValue + "]");
}
long scaledValue = Math.round(doubleValue * fieldType().getScalingFactor());
boolean indexed = fieldType().indexOptions() != IndexOptions.NONE;
boolean docValued = fieldType().hasDocValues();
boolean stored = fieldType().stored();
fields.addAll(NumberFieldMapper.NumberType.LONG.createFields(fieldType().name(), scaledValue, indexed, docValued, stored));
}
@Override
protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
super.doMerge(mergeWith, updateAllTypes);
ScaledFloatFieldMapper other = (ScaledFloatFieldMapper) mergeWith;
this.includeInAll = other.includeInAll;
if (other.ignoreMalformed.explicit()) {
this.ignoreMalformed = other.ignoreMalformed;
}
if (other.coerce.explicit()) {
this.coerce = other.coerce;
}
}
@Override
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
super.doXContentBody(builder, includeDefaults, params);
builder.field("scaling_factor", fieldType().getScalingFactor());
if (includeDefaults || ignoreMalformed.explicit()) {
builder.field("ignore_malformed", ignoreMalformed.value());
}
if (includeDefaults || coerce.explicit()) {
builder.field("coerce", coerce.value());
}
if (includeDefaults || fieldType().nullValue() != null) {
builder.field("null_value", fieldType().nullValue());
}
if (includeInAll != null) {
builder.field("include_in_all", includeInAll);
} else if (includeDefaults) {
builder.field("include_in_all", false);
}
}
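// Fielddata wrapper that exposes the underlying long doc values as doubles
// by dividing them by the scaling factor.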
private static class ScaledFloatIndexFieldData implements IndexNumericFieldData {
private final IndexNumericFieldData scaledFieldData;
private final double scalingFactor;
ScaledFloatIndexFieldData(IndexNumericFieldData scaledFieldData, double scalingFactor) {
this.scaledFieldData = scaledFieldData;
this.scalingFactor = scalingFactor;
}
@Override
public String getFieldName() {
return scaledFieldData.getFieldName();
}
@Override
public AtomicNumericFieldData load(LeafReaderContext context) {
return new ScaledFloatLeafFieldData(scaledFieldData.load(context), scalingFactor);
}
@Override
public AtomicNumericFieldData loadDirect(LeafReaderContext context) throws Exception {
return new ScaledFloatLeafFieldData(scaledFieldData.loadDirect(context), scalingFactor);
}
@Override
public org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource comparatorSource(Object missingValue,
MultiValueMode sortMode, Nested nested) {
return new DoubleValuesComparatorSource(this, missingValue, sortMode, nested);
}
@Override
public void clear() {
scaledFieldData.clear();
}
@Override
public Index index() {
return scaledFieldData.index();
}
@Override
public NumericType getNumericType() {
return scaledFieldData.getNumericType();
}
}
private static class ScaledFloatLeafFieldData implements AtomicNumericFieldData {
private final AtomicNumericFieldData scaledFieldData;
private final double scalingFactorInverse;
ScaledFloatLeafFieldData(AtomicNumericFieldData scaledFieldData, double scalingFactor) {
this.scaledFieldData = scaledFieldData;
this.scalingFactorInverse = 1d / scalingFactor;
}
@Override
public ScriptDocValues.Doubles getScriptValues() {
return new ScriptDocValues.Doubles(getDoubleValues());
}
@Override
public SortedBinaryDocValues getBytesValues() {
return FieldData.toString(getDoubleValues());
}
@Override
public long ramBytesUsed() {
return scaledFieldData.ramBytesUsed();
}
@Override
public void close() {
scaledFieldData.close();
}
@Override
public SortedNumericDocValues getLongValues() {
return FieldData.castToLong(getDoubleValues());
}
@Override
public SortedNumericDoubleValues getDoubleValues() {
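// Decode lazily: multiply each stored long by 1 / scaling_factor, keeping
// Lucene's single-valued optimization when the field has at most one value.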
final SortedNumericDocValues values = scaledFieldData.getLongValues();
final NumericDocValues singleValues = DocValues.unwrapSingleton(values);
if (singleValues != null) {
return FieldData.singleton(new NumericDoubleValues() {
@Override
public double get(int docID) {
return singleValues.get(docID) * scalingFactorInverse;
}
}, DocValues.unwrapSingletonBits(values));
} else {
return new SortedNumericDoubleValues() {
@Override
public double valueAt(int index) {
return values.valueAt(index) * scalingFactorInverse;
}
@Override
public void setDocument(int doc) {
values.setDocument(doc);
}
@Override
public int count() {
return values.count();
}
};
}
}
}
}


@@ -39,6 +39,7 @@ import org.elasticsearch.index.mapper.core.StringFieldMapper;
import org.elasticsearch.index.mapper.core.TextFieldMapper;
import org.elasticsearch.index.mapper.core.TokenCountFieldMapper;
import org.elasticsearch.index.mapper.core.NumberFieldMapper;
+ import org.elasticsearch.index.mapper.core.ScaledFloatFieldMapper;
import org.elasticsearch.index.mapper.geo.GeoPointFieldMapper;
import org.elasticsearch.index.mapper.geo.GeoShapeFieldMapper;
import org.elasticsearch.index.mapper.internal.AllFieldMapper;
@@ -105,6 +106,7 @@ public class IndicesModule extends AbstractModule {
mappers.put(BinaryFieldMapper.CONTENT_TYPE, new BinaryFieldMapper.TypeParser());
mappers.put(DateFieldMapper.CONTENT_TYPE, new DateFieldMapper.TypeParser());
mappers.put(IpFieldMapper.CONTENT_TYPE, new IpFieldMapper.TypeParser());
+ mappers.put(ScaledFloatFieldMapper.CONTENT_TYPE, new ScaledFloatFieldMapper.TypeParser());
mappers.put(StringFieldMapper.CONTENT_TYPE, new StringFieldMapper.TypeParser());
mappers.put(TextFieldMapper.CONTENT_TYPE, new TextFieldMapper.TypeParser());
mappers.put(KeywordFieldMapper.CONTENT_TYPE, new KeywordFieldMapper.TypeParser());


@@ -0,0 +1,339 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper.core;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexableField;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.junit.Before;
import java.io.IOException;
import static org.hamcrest.Matchers.containsString;
public class ScaledFloatFieldMapperTests extends ESSingleNodeTestCase {
IndexService indexService;
DocumentMapperParser parser;
@Before
public void before() {
indexService = createIndex("test");
parser = indexService.mapperService().documentMapperParser();
}
public void testDefaults() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", 123)
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertFalse(pointField.fieldType().stored());
assertEquals(1230, pointField.numericValue().longValue());
IndexableField dvField = fields[1];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
assertEquals(1230, dvField.numericValue().longValue());
assertFalse(dvField.fieldType().stored());
}
public void testMissingScalingFactor() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float").endObject().endObject()
.endObject().endObject().string();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> parser.parse("type", new CompressedXContent(mapping)));
assertEquals("Field [field] misses required parameter [scaling_factor]", e.getMessage());
}
public void testIllegalScalingFactor() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", -1).endObject().endObject()
.endObject().endObject().string();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> parser.parse("type", new CompressedXContent(mapping)));
assertEquals("[scaling_factor] must be a positive number, got [-1.0]", e.getMessage());
}
public void testNotIndexed() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("index", false).field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", 123)
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(1, fields.length);
IndexableField dvField = fields[0];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
assertEquals(1230, dvField.numericValue().longValue());
}
public void testNoDocValues() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("doc_values", false).field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", 123)
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(1, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1230, pointField.numericValue().longValue());
}
public void testStore() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("store", true).field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", 123)
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(3, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1230, pointField.numericValue().doubleValue(), 0d);
IndexableField dvField = fields[1];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
IndexableField storedField = fields[2];
assertTrue(storedField.fieldType().stored());
assertEquals(1230, storedField.numericValue().longValue());
}
public void testCoerce() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", "123")
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertEquals(1230, pointField.numericValue().longValue());
IndexableField dvField = fields[1];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", 10.0).field("coerce", false).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper2 = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper2.mappingSource().toString());
ThrowingRunnable runnable = () -> mapper2.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", "123")
.endObject()
.bytes());
MapperParsingException e = expectThrows(MapperParsingException.class, runnable);
assertThat(e.getCause().getMessage(), containsString("passed as String"));
}
public void testIgnoreMalformed() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ThrowingRunnable runnable = () -> mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", "a")
.endObject()
.bytes());
MapperParsingException e = expectThrows(MapperParsingException.class, runnable);
assertThat(e.getCause().getMessage(), containsString("For input string: \"a\""));
mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", 10.0).field("ignore_malformed", true).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper2 = parser.parse("type", new CompressedXContent(mapping));
ParsedDocument doc = mapper2.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", "a")
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
}
public void testIncludeInAll() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field").field("type", "scaled_float")
.field("scaling_factor", 10.0).endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", 123)
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("_all");
assertEquals(1, fields.length);
assertEquals("123", fields[0].stringValue());
mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "scaled_float").field("scaling_factor", 10.0)
.field("include_in_all", false).endObject().endObject()
.endObject().endObject().string();
mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", 123)
.endObject()
.bytes());
fields = doc.rootDoc().getFields("_all");
assertEquals(0, fields.length);
}
public void testNullValue() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject()
.startObject("type")
.startObject("properties")
.startObject("field")
.field("type", "scaled_float")
.field("scaling_factor", 10.0)
.endObject()
.endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.nullField("field")
.endObject()
.bytes());
assertArrayEquals(new IndexableField[0], doc.rootDoc().getFields("field"));
mapping = XContentFactory.jsonBuilder().startObject()
.startObject("type")
.startObject("properties")
.startObject("field")
.field("type", "scaled_float")
.field("scaling_factor", 10.0)
.field("null_value", 2.5)
.endObject()
.endObject()
.endObject().endObject().string();
mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.nullField("field")
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
IndexableField pointField = fields[0];
assertEquals(1, pointField.fieldType().pointDimensionCount());
assertFalse(pointField.fieldType().stored());
assertEquals(25, pointField.numericValue().longValue());
IndexableField dvField = fields[1];
assertEquals(DocValuesType.SORTED_NUMERIC, dvField.fieldType().docValuesType());
assertFalse(dvField.fieldType().stored());
}
}


@@ -0,0 +1,205 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper.core;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.Version;
import org.elasticsearch.action.fieldstats.FieldStats;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.fielddata.AtomicNumericFieldData;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.index.mapper.FieldTypeTestCase;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.junit.Before;
import java.io.IOException;
import java.util.Arrays;
public class ScaledFloatFieldTypeTests extends FieldTypeTestCase {
@Override
protected MappedFieldType createDefaultFieldType() {
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setScalingFactor(100);
return ft;
}
@Before
public void setupProperties() {
addModifier(new Modifier("scaling_factor", false) {
@Override
public void modify(MappedFieldType ft) {
ScaledFloatFieldMapper.ScaledFloatFieldType tft = (ScaledFloatFieldMapper.ScaledFloatFieldType)ft;
tft.setScalingFactor(10);
}
@Override
public void normalizeOther(MappedFieldType other) {
super.normalizeOther(other);
((ScaledFloatFieldMapper.ScaledFloatFieldType) other).setScalingFactor(100);
}
});
}
public void testTermQuery() {
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setName("scaled_float");
ft.setScalingFactor(0.1 + randomDouble() * 100);
double value = (randomDouble() * 2 - 1) * 10000;
long scaledValue = Math.round(value * ft.getScalingFactor());
assertEquals(LongPoint.newExactQuery("scaled_float", scaledValue), ft.termQuery(value, null));
}
public void testTermsQuery() {
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setName("scaled_float");
ft.setScalingFactor(0.1 + randomDouble() * 100);
double value1 = (randomDouble() * 2 - 1) * 10000;
long scaledValue1 = Math.round(value1 * ft.getScalingFactor());
double value2 = (randomDouble() * 2 - 1) * 10000;
long scaledValue2 = Math.round(value2 * ft.getScalingFactor());
assertEquals(
LongPoint.newSetQuery("scaled_float", scaledValue1, scaledValue2),
ft.termsQuery(Arrays.asList(value1, value2), null));
}
public void testRangeQuery() throws IOException {
// make sure the accuracy loss of scaled floats only occurs at index time
// this test checks that searching scaled floats yields the same results as
// searching doubles that are rounded to the closest value the scaled float can represent
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setName("scaled_float");
ft.setScalingFactor(0.1 + randomDouble() * 100);
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(null));
final int numDocs = 1000;
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
double value = (randomDouble() * 2 - 1) * 10000;
long scaledValue = Math.round(value * ft.getScalingFactor());
double rounded = scaledValue / ft.getScalingFactor();
doc.add(new LongPoint("scaled_float", scaledValue));
doc.add(new DoublePoint("double", rounded));
w.addDocument(doc);
}
final DirectoryReader reader = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = newSearcher(reader);
final int numQueries = 1000;
for (int i = 0; i < numQueries; ++i) {
double l = (randomDouble() * 2 - 1) * 10000;
double u = (randomDouble() * 2 - 1) * 10000;
boolean includeLower = randomBoolean();
boolean includeUpper = randomBoolean();
Query doubleQ = NumberFieldMapper.NumberType.DOUBLE.rangeQuery("double", l, u, includeLower, includeUpper);
Query scaledFloatQ = ft.rangeQuery(l, u, includeLower, includeUpper);
assertEquals(searcher.count(doubleQ), searcher.count(scaledFloatQ));
}
IOUtils.close(reader, dir);
}
public void testValueForSearch() {
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setName("scaled_float");
ft.setScalingFactor(0.1 + randomDouble() * 100);
assertNull(ft.valueForSearch(null));
assertEquals(10/ft.getScalingFactor(), ft.valueForSearch(10L));
}
public void testStats() throws IOException {
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setName("scaled_float");
ft.setScalingFactor(0.1 + randomDouble() * 100);
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(null));
try (DirectoryReader reader = DirectoryReader.open(w)) {
assertNull(ft.stats(reader));
}
Document doc = new Document();
LongPoint point = new LongPoint("scaled_float", -1);
doc.add(point);
w.addDocument(doc);
point.setLongValue(10);
w.addDocument(doc);
try (DirectoryReader reader = DirectoryReader.open(w)) {
FieldStats<?> stats = ft.stats(reader);
assertEquals(-1/ft.getScalingFactor(), stats.getMinValue());
assertEquals(10/ft.getScalingFactor(), stats.getMaxValue());
assertEquals(2, stats.getMaxDoc());
}
w.deleteAll();
try (DirectoryReader reader = DirectoryReader.open(w)) {
assertNull(ft.stats(reader));
}
IOUtils.close(w, dir);
}
public void testFieldData() throws IOException {
ScaledFloatFieldMapper.ScaledFloatFieldType ft = new ScaledFloatFieldMapper.ScaledFloatFieldType();
ft.setScalingFactor(0.1 + randomDouble() * 100);
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(null));
Document doc = new Document();
doc.add(new SortedNumericDocValuesField("scaled_float1", 10));
doc.add(new SortedNumericDocValuesField("scaled_float2", 5));
doc.add(new SortedNumericDocValuesField("scaled_float2", 12));
w.addDocument(doc);
try (DirectoryReader reader = DirectoryReader.open(w)) {
IndexMetaData indexMetadata = new IndexMetaData.Builder("index").settings(
Settings.builder()
.put("index.version.created", Version.CURRENT)
.put("index.number_of_shards", 1)
.put("index.number_of_replicas", 0).build()).build();
IndexSettings indexSettings = new IndexSettings(indexMetadata, Settings.EMPTY);
// single-valued
ft.setName("scaled_float1");
IndexNumericFieldData fielddata = (IndexNumericFieldData) ft.fielddataBuilder().build(indexSettings, ft, null, null, null);
AtomicNumericFieldData leafFieldData = fielddata.load(reader.leaves().get(0));
SortedNumericDoubleValues values = leafFieldData.getDoubleValues();
values.setDocument(0);
assertEquals(1, values.count());
assertEquals(10/ft.getScalingFactor(), values.valueAt(0), 10e-5);
// multi-valued
ft.setName("scaled_float2");
fielddata = (IndexNumericFieldData) ft.fielddataBuilder().build(indexSettings, ft, null, null, null);
leafFieldData = fielddata.load(reader.leaves().get(0));
values = leafFieldData.getDoubleValues();
values.setDocument(0);
assertEquals(2, values.count());
assertEquals(5/ft.getScalingFactor(), values.valueAt(0), 10e-5);
assertEquals(12/ft.getScalingFactor(), values.valueAt(1), 10e-5);
}
IOUtils.close(w, dir);
}
}


@@ -152,8 +152,9 @@ space. They can be compressed more aggressively by using the `best_compression`
[float]
=== Use the smallest numeric type that is sufficient
- When storing <<number,numeric data>>, using `float` over `double`, or `half_float`
- over `float` can help save storage. This is also true for integer types, but less
- since Elasticsearch will more easily compress them based on the number of bits
- that they actually need.
+ The type that you pick for <<number,numeric data>> can have a significant impact
+ on disk usage. In particular, integers should be stored using an integer type
+ (`byte`, `short`, `integer` or `long`) and floating points should either be
+ stored in a `scaled_float` if appropriate or in the smallest type that fits the
+ use-case: using `float` over `double`, or `half_float` over `float` will help
+ save storage.


@@ -4,13 +4,14 @@
The following numeric types are supported:
[horizontal]
- `long`:: A signed 64-bit integer with a minimum value of +-2^63^+ and a maximum value of +2^63^-1+.
- `integer`:: A signed 32-bit integer with a minimum value of +-2^31^+ and a maximum value of +2^31^-1+.
- `short`:: A signed 16-bit integer with a minimum value of +-32,768+ and a maximum value of +32,767+.
- `byte`:: A signed 8-bit integer with a minimum value of +-128+ and a maximum value of +127+.
- `double`:: A double-precision 64-bit IEEE 754 floating point.
- `float`:: A single-precision 32-bit IEEE 754 floating point.
- `half_float`:: A half-precision 16-bit IEEE 754 floating point.
+ `long`:: A signed 64-bit integer with a minimum value of +-2^63^+ and a maximum value of +2^63^-1+.
+ `integer`:: A signed 32-bit integer with a minimum value of +-2^31^+ and a maximum value of +2^31^-1+.
+ `short`:: A signed 16-bit integer with a minimum value of +-32,768+ and a maximum value of +32,767+.
+ `byte`:: A signed 8-bit integer with a minimum value of +-128+ and a maximum value of +127+.
+ `double`:: A double-precision 64-bit IEEE 754 floating point.
+ `float`:: A single-precision 32-bit IEEE 754 floating point.
+ `half_float`:: A half-precision 16-bit IEEE 754 floating point.
+ `scaled_float`:: A floating point that is backed by a `long` and a fixed scaling factor.
Below is an example of configuring a mapping with numeric fields:
@@ -26,6 +27,10 @@ PUT my_index
},
"time_in_seconds": {
"type": "float"
+ },
+ "price": {
+ "type": "scaled_float",
+ "scaling_factor": 100
}
}
}
@@ -42,11 +47,24 @@ help indexing and searching be more efficient. Note however that given that
storage is optimized based on the actual values that are stored, picking one
type over another one will have no impact on storage requirements.
- For floating-point types, picking the smallest type that is enough for the
- use-case will still help indexing and searching be more efficient. However,
- given that floating-point data is hard to compress, it might also have a
- significant impact on storage requirements. Here is a table that compares the
- 3 floating-point types that are available in order to help make a decision.
+ For floating-point types, it is often more efficient to store floating-point
+ data as an integer using a scaling factor, which is what the `scaled_float`
+ type does under the hood. For instance, a `price` field could be stored in a
+ `scaled_float` with a `scaling_factor` of +100+. All APIs would work as if
+ the field was stored as a double, but under the hood Elasticsearch would be
+ working with the number of cents, +price*100+, which is an integer. This is
+ mostly helpful to save disk space since integers are much easier to compress
+ than floating points. `scaled_float` is also a fine way to trade accuracy
+ for disk space. For instance, imagine that you are tracking CPU utilization
+ as a number between +0+ and +1+. It usually does not matter much whether CPU
+ utilization is +12.7%+ or +13%+, so you could use a `scaled_float` with a
+ `scaling_factor` of +100+ to round CPU utilization to the closest percent
+ and save space.
+ If `scaled_float` is not a good fit, then you should pick the smallest type
+ that is enough for the use-case among the floating-point types: `double`,
+ `float` and `half_float`. Here is a table that compares these types in order
+ to help make a decision.
[cols="<,<,<,<",options="header",]
|=======================================================================
@@ -56,12 +74,6 @@ significant impact on storage requirements. Here is a table that compares the
|`half_float`|+2^-24^+ |+65504+ |+11+ / +3.31+
|=======================================================================
- When possible, it is often more efficient to store floating-point data into an
- integer using a scaling factor. For instance, it is more efficient to store
- percentages as integers between 0 and 100 than as floating-point numbers between 0
- and 1. Another example would be prices: it will be more efficient to store prices
- as a number of cents, which is an integer, than as a floating-point number.
[[number-params]]
==== Parameters for numeric fields
@@ -114,4 +126,19 @@ The following parameters are accepted by numeric types:
the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
(default).
[[scaled-float-params]]
==== Parameters for `scaled_float`
`scaled_float` accepts an additional parameter:
[horizontal]
`scaling_factor`::
The scaling factor to use when encoding values. Values will be multiplied
by this factor at index time and rounded to the closest long value. For
instance, a `scaled_float` with a `scaling_factor` of +10+ would internally
store +2.34+ as +23+ and all search-time operations (queries, aggregations,
sorting) will behave as if the document had a value of +2.3+. High values
of `scaling_factor` improve accuracy but also increase space requirements.
This parameter is required.
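
To make the example above concrete, a quick Java sketch (illustrative only,
not part of this change):

    // scaling_factor = 10: 2.34 is indexed as round(2.34 * 10) = 23,
    // and all search-time operations behave as if the value were 23 / 10 = 2.3.
    double scalingFactor = 10;
    long indexed = Math.round(2.34 * scalingFactor); // 23
    double effective = indexed / scalingFactor;      // 2.3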