Expose half-floats. #18887

They have been implemented in https://issues.apache.org/jira/browse/LUCENE-7289.
Ranges are implemented so that the accuracy loss only occurs at index time,
which means that if you are searching for values between A and B, the query will
match exactly all documents whose value rounded to the closest half-float point
is between A and B.
This commit is contained in:
Adrien Grand 2016-06-15 10:35:55 +02:00
parent 18ff051ad5
commit 9ffb2ff6ba
8 changed files with 372 additions and 12 deletions

View File

@ -29,6 +29,7 @@ public interface IndexNumericFieldData extends IndexFieldData<AtomicNumericField
SHORT(false),
INT(false),
LONG(false),
HALF_FLOAT(true),
FLOAT(true),
DOUBLE(true);

View File

@ -19,6 +19,7 @@
package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.document.HalfFloatPoint;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.LeafReader;
@ -61,6 +62,7 @@ public class SortedNumericDVIndexFieldData extends DocValuesIndexFieldData imple
@Override
public org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource comparatorSource(Object missingValue, MultiValueMode sortMode, Nested nested) {
switch (numericType) {
case HALF_FLOAT:
case FLOAT:
return new FloatValuesComparatorSource(this, missingValue, sortMode, nested);
case DOUBLE:
@ -87,6 +89,8 @@ public class SortedNumericDVIndexFieldData extends DocValuesIndexFieldData imple
final String field = fieldName;
switch (numericType) {
case HALF_FLOAT:
return new SortedNumericHalfFloatFieldData(reader, field);
case FLOAT:
return new SortedNumericFloatFieldData(reader, field);
case DOUBLE:
@ -134,6 +138,95 @@ public class SortedNumericDVIndexFieldData extends DocValuesIndexFieldData imple
}
}
/**
 * Doc-values backed field data for {@code half_float} (16-bit float) fields.
 * <p>
 * Values are stored as sortable shorts so that the order of values within a
 * document is consistent with {@link Float#compareTo(Float)}. The reversible
 * transformation applied at both index and search time is:
 * {@code bits ^ (bits >> 15) & 0x7fff}
 * <p>
 * The API is multi-valued, but most Lucene codecs specialize for documents
 * that have at most one value. In that case
 * {@link FieldData#unwrapSingleton(SortedNumericDoubleValues)} returns the
 * underlying single-valued NumericDoubleValues representation, and
 * {@link FieldData#unwrapSingletonBits(SortedNumericDoubleValues)} returns
 * a Bits matching documents that have a real value (as opposed to missing).
 */
static final class SortedNumericHalfFloatFieldData extends AtomicDoubleFieldData {
    final LeafReader reader;
    final String field;

    SortedNumericHalfFloatFieldData(LeafReader reader, String field) {
        // NOTE(review): 0L is presumably the reported memory usage - confirm against AtomicDoubleFieldData.
        super(0L);
        this.reader = reader;
        this.field = field;
    }

    /**
     * Loads the sorted numeric doc values of {@link #field} and exposes them as doubles.
     *
     * @return a singleton-backed view when the doc values can be unwrapped to a
     *         single-valued representation, a multi-valued view otherwise
     * @throws IllegalStateException if reading the doc values fails with an {@link IOException}
     */
    @Override
    public SortedNumericDoubleValues getDoubleValues() {
        try {
            final SortedNumericDocValues multiValued = DocValues.getSortedNumeric(reader, field);
            final NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
            if (singleValued == null) {
                // no single-valued specialization available: decode every value per document
                return new MultiHalfFloatValues(multiValued);
            }
            return FieldData.singleton(new SingleHalfFloatValues(singleValued), DocValues.unwrapSingletonBits(multiValued));
        } catch (IOException e) {
            throw new IllegalStateException("Cannot load doc values", e);
        }
    }

    /** This implementation reports no child resources. */
    @Override
    public Collection<Accountable> getChildResources() {
        return Collections.emptyList();
    }
}
/**
 * Single-valued view over a NumericDocValues whose per-document value is a
 * 16-bit float encoded as a sortable short.
 */
static final class SingleHalfFloatValues extends NumericDoubleValues {
    final NumericDocValues in;

    SingleHalfFloatValues(NumericDocValues in) {
        this.in = in;
    }

    /**
     * Decodes the half float stored for the given document.
     *
     * @param docID the document whose value is read from the wrapped doc values
     * @return the decoded half float, widened to a double
     */
    @Override
    public double get(int docID) {
        // doc values hold a long; only the low 16 bits carry the sortable encoding
        final short sortableBits = (short) in.get(docID);
        return HalfFloatPoint.sortableShortToHalfFloat(sortableBits);
    }
}
/**
 * Multi-valued view over a SortedNumericDocValues whose values are 16-bit
 * floats encoded as sortable shorts.
 */
static final class MultiHalfFloatValues extends SortedNumericDoubleValues {
    final SortedNumericDocValues in;

    MultiHalfFloatValues(SortedNumericDocValues in) {
        this.in = in;
    }

    /** Returns the value count reported by the wrapped doc values. */
    @Override
    public int count() {
        return in.count();
    }

    /** Forwards the document selection to the wrapped doc values. */
    @Override
    public void setDocument(int doc) {
        in.setDocument(doc);
    }

    /**
     * Decodes the value at the given position.
     *
     * @param index position passed through to the wrapped doc values
     * @return the decoded half float, widened to a double
     */
    @Override
    public double valueAt(int index) {
        // doc values hold a long; only the low 16 bits carry the sortable encoding
        final short sortableBits = (short) in.valueAt(index);
        return HalfFloatPoint.sortableShortToHalfFloat(sortableBits);
    }
}
/**
* FieldData implementation for 32-bit float values.
* <p>

View File

@ -22,6 +22,7 @@ package org.elasticsearch.index.mapper.core;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.HalfFloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField;
@ -180,6 +181,86 @@ public class NumberFieldMapper extends FieldMapper implements AllFieldMapper.Inc
}
public enum NumberType {
HALF_FLOAT("half_float", NumericType.HALF_FLOAT) {
/** Parses {@code value} by delegating to {@code FLOAT}'s parsing. */
@Override
Float parse(Object value) {
    final Number parsed = FLOAT.parse(value);
    return (Float) parsed;
}
/** Reads the parser's current value as a float, forwarding the {@code coerce} flag to the parser. */
@Override
Float parse(XContentParser parser, boolean coerce) throws IOException {
    final float parsed = parser.floatValue(coerce);
    return parsed;
}
/** Builds an exact-match query on {@code field} for the parsed {@code value}. */
@Override
Query termQuery(String field, Object value) {
    return HalfFloatPoint.newExactQuery(field, parse(value));
}
/** Builds a set query on {@code field} from all of the given values, parsed in list order. */
@Override
Query termsQuery(String field, List<Object> values) {
    final float[] parsed = new float[values.size()];
    int slot = 0;
    for (Object value : values) {
        parsed[slot++] = parse(value);
    }
    return HalfFloatPoint.newSetQuery(field, parsed);
}
/**
 * Builds a range query over half-float points.
 * <p>
 * A {@code null} bound leaves that side open (negative or positive infinity).
 * Each provided bound is parsed as a float and then moved onto a half-float
 * value: an inclusive bound is first nudged outwards by one float ulp so that
 * the following {@code HalfFloatPoint.nextUp} / {@code nextDown} step can land
 * on the bound itself.
 * NOTE(review): this relies on {@code HalfFloatPoint.nextUp} / {@code nextDown}
 * returning the closest half float strictly above / below their argument -
 * confirm against the Lucene documentation.
 */
@Override
Query rangeQuery(String field, Object lowerTerm, Object upperTerm,
                 boolean includeLower, boolean includeUpper) {
    float lower = Float.NEGATIVE_INFINITY;
    if (lowerTerm != null) {
        final float requested = parse(lowerTerm);
        lower = HalfFloatPoint.nextUp(includeLower ? Math.nextDown(requested) : requested);
    }

    float upper = Float.POSITIVE_INFINITY;
    if (upperTerm != null) {
        final float requested = parse(upperTerm);
        upper = HalfFloatPoint.nextDown(includeUpper ? Math.nextUp(requested) : requested);
    }

    return HalfFloatPoint.newRangeQuery(field, lower, upper);
}
/**
 * Creates the Lucene fields for one half-float value.
 * <p>
 * Depending on the flags, the returned list holds (in this order) a point
 * field, a sorted numeric doc-values field carrying the sortable-short
 * encoding of the value, and a stored field carrying the plain float.
 */
@Override
public List<Field> createFields(String name, Number value,
                                boolean indexed, boolean docValued, boolean stored) {
    final List<Field> result = new ArrayList<>();
    if (indexed) {
        result.add(new HalfFloatPoint(name, value.floatValue()));
    }
    if (docValued) {
        final short sortableBits = HalfFloatPoint.halfFloatToSortableShort(value.floatValue());
        result.add(new SortedNumericDocValuesField(name, sortableBits));
    }
    if (stored) {
        result.add(new StoredField(name, value.floatValue()));
    }
    return result;
}
/**
 * Computes field statistics from the point values of {@code fieldName}.
 *
 * @return {@code null} when the reader reports no point values for the field,
 *         otherwise stats built from the document count, the point count and
 *         the decoded minimum / maximum packed values
 */
@Override
FieldStats.Double stats(IndexReader reader, String fieldName,
                        boolean isSearchable, boolean isAggregatable) throws IOException {
    final long pointCount = XPointValues.size(reader, fieldName);
    if (pointCount == 0) {
        return null;
    }

    final int docCount = XPointValues.getDocCount(reader, fieldName);
    final byte[] minPacked = XPointValues.getMinPackedValue(reader, fieldName);
    final byte[] maxPacked = XPointValues.getMaxPackedValue(reader, fieldName);

    final int maxDoc = reader.maxDoc();
    final float minValue = HalfFloatPoint.decodeDimension(minPacked, 0);
    final float maxValue = HalfFloatPoint.decodeDimension(maxPacked, 0);
    return new FieldStats.Double(maxDoc, docCount, -1L, pointCount,
        isSearchable, isAggregatable,
        minValue, maxValue);
}
},
FLOAT("float", NumericType.FLOAT) {
@Override
Float parse(Object value) {

View File

@ -19,6 +19,7 @@
package org.elasticsearch.fieldstats;
import org.apache.lucene.document.HalfFloatPoint;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.fieldstats.FieldStats;
@ -54,6 +55,7 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
"string", "type=text",
"date", "type=date",
"double", "type=double",
"half_float", "type=half_float",
"float", "type=float",
"long", "type=long",
"integer", "type=integer",
@ -67,6 +69,7 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
"string", "type=text,index=false",
"date", "type=date,index=false",
"double", "type=double,index=false",
"half_float", "type=half_float",
"float", "type=float,index=false",
"long", "type=long,index=false",
"integer", "type=integer,index=false",
@ -81,6 +84,7 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
"string", "type=text,index=false",
"date", "type=date,index=false",
"double", "type=double,index=false",
"half_float", "type=half_float",
"float", "type=float,index=false",
"long", "type=long,index=false",
"integer", "type=integer,index=false",
@ -97,10 +101,12 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
long maxInt = Integer.MIN_VALUE;
long minLong = Long.MAX_VALUE;
long maxLong = Long.MIN_VALUE;
double minFloat = Float.MAX_VALUE;
double maxFloat = Float.MIN_VALUE;
double minDouble = Double.MAX_VALUE;
double maxDouble = Double.MIN_VALUE;
double minHalfFloat = Double.POSITIVE_INFINITY;
double maxHalfFloat = Double.NEGATIVE_INFINITY;
double minFloat = Double.POSITIVE_INFINITY;
double maxFloat = Double.NEGATIVE_INFINITY;
double minDouble = Double.POSITIVE_INFINITY;
double maxDouble = Double.NEGATIVE_INFINITY;
String minString = new String(Character.toChars(1114111));
String maxString = "0";
@ -119,6 +125,10 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
long l = randomLong();
minLong = Math.min(minLong, l);
maxLong = Math.max(maxLong, l);
float hf = randomFloat();
hf = HalfFloatPoint.sortableShortToHalfFloat(HalfFloatPoint.halfFloatToSortableShort(hf));
minHalfFloat = Math.min(minHalfFloat, hf);
maxHalfFloat = Math.max(maxHalfFloat, hf);
float f = randomFloat();
minFloat = Math.min(minFloat, f);
maxFloat = Math.max(maxFloat, f);
@ -138,6 +148,7 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
"short", s,
"integer", i,
"long", l,
"half_float", hf,
"float", f,
"double", d,
"string", str)
@ -147,7 +158,7 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
FieldStatsResponse response = client()
.prepareFieldStats()
.setFields("byte", "short", "integer", "long", "float", "double", "string").get();
.setFields("byte", "short", "integer", "long", "half_float", "float", "double", "string").get();
assertAllSuccessful(response);
for (FieldStats<?> stats : response.getAllFieldStats().values()) {
@ -164,6 +175,8 @@ public class FieldStatsIntegrationIT extends ESIntegTestCase {
assertThat(response.getAllFieldStats().get("integer").getMaxValue(), equalTo(maxInt));
assertThat(response.getAllFieldStats().get("long").getMinValue(), equalTo(minLong));
assertThat(response.getAllFieldStats().get("long").getMaxValue(), equalTo(maxLong));
assertThat(response.getAllFieldStats().get("half_float").getMinValue(), equalTo(minHalfFloat));
assertThat(response.getAllFieldStats().get("half_float").getMaxValue(), equalTo(maxHalfFloat));
assertThat(response.getAllFieldStats().get("float").getMinValue(), equalTo(minFloat));
assertThat(response.getAllFieldStats().get("float").getMaxValue(), equalTo(maxFloat));
assertThat(response.getAllFieldStats().get("double").getMinValue(), equalTo(minDouble));

View File

@ -108,6 +108,24 @@ public class FieldStatsTests extends ESSingleNodeTestCase {
assertThat(result.getAllFieldStats().get(fieldName).getMinValueAsString(), equalTo(Double.toString(-1)));
}
/**
 * Indexes the eleven whole numbers from -1 to 9 into a {@code half_float}
 * field, one document each, and checks the reported field stats.
 */
public void testHalfFloat() {
    final String fieldName = "field";
    createIndex("test", Settings.EMPTY, "test", fieldName, "type=half_float");

    float value = -1;
    while (value <= 9) {
        client().prepareIndex("test", "test").setSource(fieldName, value).get();
        value++;
    }
    client().admin().indices().prepareRefresh().get();

    FieldStatsResponse response = client().prepareFieldStats().setFields(fieldName).get();

    // every document carries the field, hence a density of 100
    assertThat(response.getAllFieldStats().get(fieldName).getMaxDoc(), equalTo(11L));
    assertThat(response.getAllFieldStats().get(fieldName).getDocCount(), equalTo(11L));
    assertThat(response.getAllFieldStats().get(fieldName).getDensity(), equalTo(100));

    // bounds are compared both as doubles and through their string rendering
    assertThat(response.getAllFieldStats().get(fieldName).getMinValue(), equalTo(-1d));
    assertThat(response.getAllFieldStats().get(fieldName).getMaxValue(), equalTo(9d));
    assertThat(response.getAllFieldStats().get(fieldName).getMinValueAsString(), equalTo(Float.toString(-1)));
    assertThat(response.getAllFieldStats().get(fieldName).getMaxValueAsString(), equalTo(Float.toString(9)));
}
public void testFloat() {
String fieldName = "field";
createIndex("test", Settings.EMPTY, "test", fieldName, "type=float");

View File

@ -0,0 +1,82 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import org.elasticsearch.index.fielddata.FieldData;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.index.mapper.core.NumberFieldMapper;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
/**
 * Checks that half-float doc values written through the number field mapper
 * are read back correctly by the half-float field data, for both single-valued
 * and multi-valued documents.
 */
public class HalfFloatFielddataTests extends ESTestCase {

    private static final String FIELD = "half_float";

    /** Adds the doc-values-only fields (not indexed, not stored) for {@code value} to {@code doc}. */
    private static void addDocValue(Document doc, float value) {
        for (IndexableField f : NumberFieldMapper.NumberType.HALF_FLOAT.createFields(FIELD, value, false, true, false)) {
            doc.add(f);
        }
    }

    /** Loads the half-float values of {@link #FIELD} from the given leaf. */
    private static SortedNumericDoubleValues loadValues(LeafReader reader) {
        return new SortedNumericDVIndexFieldData.SortedNumericHalfFloatFieldData(reader, FIELD).getDoubleValues();
    }

    public void testSingleValued() throws IOException {
        Directory dir = newDirectory();
        // we need the default codec to check for singletons
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(null).setCodec(TestUtil.getDefaultCodec()));

        Document doc = new Document();
        addDocValue(doc, 3f);
        w.addDocument(doc);

        final DirectoryReader dirReader = DirectoryReader.open(w);
        SortedNumericDoubleValues values = loadValues(getOnlyLeafReader(dirReader));

        // a single value per document must be exposed through the singleton fast path
        assertNotNull(FieldData.unwrapSingleton(values));
        values.setDocument(0);
        assertEquals(1, values.count());
        assertEquals(3f, values.valueAt(0), 0f);

        IOUtils.close(dirReader, w, dir);
    }

    public void testMultiValued() throws IOException {
        Directory dir = newDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(null));

        // values are added out of order on purpose: they must come back sorted
        Document doc = new Document();
        addDocValue(doc, 3f);
        addDocValue(doc, 2f);
        w.addDocument(doc);

        final DirectoryReader dirReader = DirectoryReader.open(w);
        SortedNumericDoubleValues values = loadValues(getOnlyLeafReader(dirReader));

        assertNull(FieldData.unwrapSingleton(values));
        values.setDocument(0);
        assertEquals(2, values.count());
        assertEquals(2f, values.valueAt(0), 0f);
        assertEquals(3f, values.valueAt(1), 0f);

        IOUtils.close(dirReader, w, dir);
    }
}

View File

@ -21,8 +21,18 @@ package org.elasticsearch.index.mapper.core;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.HalfFloatPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.index.mapper.FieldTypeTestCase;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MappedFieldType.Relation;
@ -81,6 +91,7 @@ public class NumberFieldTypeTests extends FieldTypeTestCase {
assertEquals((short) 3, NumberType.SHORT.parse(3d));
assertEquals(3, NumberType.INTEGER.parse(3d));
assertEquals(3L, NumberType.LONG.parse(3d));
assertEquals(3f, NumberType.HALF_FLOAT.parse(3d));
assertEquals(3f, NumberType.FLOAT.parse(3d));
assertEquals(3d, NumberType.DOUBLE.parse(3d));
@ -103,7 +114,39 @@ public class NumberFieldTypeTests extends FieldTypeTestCase {
assertEquals("Value [2147483648] is out of range for an integer", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () -> NumberType.LONG.parse(10000000000000000000d));
assertEquals("Value [1.0E19] is out of range for a long", e.getMessage());
assertEquals(1.1f, NumberType.FLOAT.parse(1.1)); // accuracy loss is expected
assertEquals(1.1f, NumberType.HALF_FLOAT.parse(1.1));
assertEquals(1.1f, NumberType.FLOAT.parse(1.1));
assertEquals(1.1d, NumberType.DOUBLE.parse(1.1));
}
/**
 * Makes sure the accuracy loss of half floats only occurs at index time:
 * searching half floats must yield the same results as searching floats
 * that were rounded to the closest half float before being indexed.
 */
public void testHalfFloatRange() throws IOException {
    // values and bounds are drawn from [-70000, 70000)
    final float magnitude = 70000;

    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));

    final int numDocs = 10000;
    for (int docIndex = 0; docIndex < numDocs; ++docIndex) {
        float original = (randomFloat() * 2 - 1) * magnitude;
        // round-trip through the sortable-short encoding to get the half float that is actually indexed
        short encoded = HalfFloatPoint.halfFloatToSortableShort(original);
        float rounded = HalfFloatPoint.sortableShortToHalfFloat(encoded);

        Document doc = new Document();
        doc.add(new HalfFloatPoint("half_float", original));
        doc.add(new FloatPoint("float", rounded));
        writer.addDocument(doc);
    }
    final DirectoryReader reader = DirectoryReader.open(writer);
    writer.close();

    IndexSearcher searcher = newSearcher(reader);
    final int numQueries = 1000;
    for (int queryIndex = 0; queryIndex < numQueries; ++queryIndex) {
        float lower = (randomFloat() * 2 - 1) * magnitude;
        float upper = (randomFloat() * 2 - 1) * magnitude;
        boolean includeLower = randomBoolean();
        boolean includeUpper = randomBoolean();

        Query floatQuery = NumberFieldMapper.NumberType.FLOAT.rangeQuery("float", lower, upper, includeLower, includeUpper);
        Query halfFloatQuery =
            NumberFieldMapper.NumberType.HALF_FLOAT.rangeQuery("half_float", lower, upper, includeLower, includeUpper);
        assertEquals(searcher.count(floatQuery), searcher.count(halfFloatQuery));
    }
    IOUtils.close(reader, dir);
}
}

View File

@ -4,12 +4,13 @@
The following numeric types are supported:
[horizontal]
`long`:: A signed 64-bit integer with a minimum value of +-2^63^+ and a maximum value of +2^63^-1+.
`integer`:: A signed 32-bit integer with a minimum value of +-2^31^+ and a maximum value of +2^31^-1+.
`short`:: A signed 16-bit integer with a minimum value of +-32,768+ and a maximum value of +32,767+.
`byte`:: A signed 8-bit integer with a minimum value of +-128+ and a maximum value of +127+.
`double`:: A double-precision 64-bit IEEE 754 floating point.
`float`:: A single-precision 32-bit IEEE 754 floating point.
`long`:: A signed 64-bit integer with a minimum value of +-2^63^+ and a maximum value of +2^63^-1+.
`integer`:: A signed 32-bit integer with a minimum value of +-2^31^+ and a maximum value of +2^31^-1+.
`short`:: A signed 16-bit integer with a minimum value of +-32,768+ and a maximum value of +32,767+.
`byte`:: A signed 8-bit integer with a minimum value of +-128+ and a maximum value of +127+.
`double`:: A double-precision 64-bit IEEE 754 floating point.
`float`:: A single-precision 32-bit IEEE 754 floating point.
`half_float`:: A half-precision 16-bit IEEE 754 floating point.
Below is an example of configuring a mapping with numeric fields:
@ -33,6 +34,34 @@ PUT my_index
--------------------------------------------------
// CONSOLE
==== Which type should I use?
As far as integer types (`byte`, `short`, `integer` and `long`) are concerned,
you should pick the smallest type which is enough for your use-case. This will
help indexing and searching be more efficient. Note however that given that
storage is optimized based on the actual values that are stored, picking one
type over another one will have no impact on storage requirements.
For floating-point types, picking the smallest type that is enough for the
use-case will still help indexing and searching be more efficient. However,
given that floating-point data is hard to compress, it might also have a
significant impact on storage requirements. Here is a table that compares the
3 floating-point types that are available in order to help make a decision.
[cols="<,<,<,<",options="header",]
|=======================================================================
|Type |Minimum positive value |Maximum value |Significant bits / digits
|`double`|+2^-1074^+ |+(2-2^-52^)·2^1023^+ |+53+ / +15.95+
|`float`|+2^-149^+ |+(2-2^-23^)·2^127^+ |+24+ / +7.22+
|`half_float`|+2^-24^+ |+65504+ |+11+ / +3.31+
|=======================================================================
When possible, it is often more efficient to store floating-point data into an
integer using a scaling factor. For instance, it is more efficient to store
percentages as integers between 0 and 100 than as floating-point numbers between 0
and 1. Another example would be prices: it will be more efficient to store prices
as a number of cents, which is an integer, than as a floating-point number.
[[number-params]]
==== Parameters for numeric fields