Implement stats for geo_point and geo_shape field (#22391)

Currently `geo_point` and `geo_shape` field are treated as `text` field by the field stats API and we
try to extract the min/max values with MultiFields.getTerms.
This is ok in master because a `geo_point` field is always a Point field but it can cause problem in 5.x (and 2.x) because the legacy
 `geo_point` are indexed as terms.
 As a result the min and max are extracted and then printed in the FieldStats output using BytesRef.utf8ToString
 which can throw an IndexOutOfBoundException since it's not valid UTF8 strings.
 This change ensure that we never try to extract min/max information from a `geo_point` field.
 It does not add a new type for geo points in the fieldstats API so we'll continue to use `text` for this kind of field.
 This PR is targeted to master even though we could only commit this change to 5.x. I think it's cleaner to have it in master too before we make any decision on
  https://github.com/elastic/elasticsearch/pull/21947.

Fixes #22384
This commit is contained in:
Jim Ferenczi 2017-01-04 10:42:22 +01:00 committed by GitHub
parent c6573e6e56
commit 360ce532eb
3 changed files with 139 additions and 12 deletions

View File

@ -19,14 +19,15 @@
package org.elasticsearch.index.mapper;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.elasticsearch.common.geo.GeoHashUtils;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.fieldstats.FieldStats;
import org.elasticsearch.common.Explicit;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.geo.GeoHashUtils;
import org.elasticsearch.common.geo.GeoPoint;
import org.elasticsearch.common.geo.GeoUtils;
import org.elasticsearch.common.logging.DeprecationLogger;
@ -170,6 +171,20 @@ public abstract class BaseGeoPointFieldMapper extends FieldMapper implements Arr
public Query termQuery(Object value, QueryShardContext context) {
throw new QueryShardException(context, "Geo fields do not support exact searching, use dedicated geo queries instead: [" + name() + "]");
}
@Override
public FieldStats stats(IndexReader reader) throws IOException {
int maxDoc = reader.maxDoc();
FieldInfo fi = org.apache.lucene.index.MultiFields.getMergedFieldInfos(reader).fieldInfo(name());
if (fi == null) {
return null;
}
/**
* we don't have a specific type for geo_point so we use an empty {@link FieldStats.Text}.
* TODO: we should maybe support a new type that knows how to (de)encode the min/max information
*/
return new FieldStats.Text(maxDoc, -1, -1, -1, isSearchable(), isAggregatable());
}
}
protected Explicit<Boolean> ignoreMalformed;

View File

@ -18,12 +18,11 @@
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.index.IndexableField;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;
import org.locationtech.spatial4j.shape.jts.JtsGeometry;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.spatial.prefix.PrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
@ -33,8 +32,8 @@ import org.apache.lucene.spatial.prefix.tree.PackedQuadPrefixTree;
import org.apache.lucene.spatial.prefix.tree.QuadPrefixTree;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.elasticsearch.Version;
import org.elasticsearch.action.fieldstats.FieldStats;
import org.elasticsearch.common.Explicit;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.geo.GeoUtils;
import org.elasticsearch.common.geo.SpatialStrategy;
import org.elasticsearch.common.geo.builders.ShapeBuilder;
@ -45,6 +44,9 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.QueryShardException;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;
import org.locationtech.spatial4j.shape.jts.JtsGeometry;
import java.io.IOException;
import java.util.Iterator;
@ -415,6 +417,20 @@ public class GeoShapeFieldMapper extends FieldMapper {
public Query termQuery(Object value, QueryShardContext context) {
throw new QueryShardException(context, "Geo fields do not support exact searching, use dedicated geo queries instead");
}
@Override
public FieldStats stats(IndexReader reader) throws IOException {
int maxDoc = reader.maxDoc();
FieldInfo fi = org.apache.lucene.index.MultiFields.getMergedFieldInfos(reader).fieldInfo(name());
if (fi == null) {
return null;
}
/**
* we don't have a specific type for geo_shape so we use an empty {@link FieldStats.Text}.
* TODO: we should maybe support a new type that knows how to (de)encode the min/max information
*/
return new FieldStats.Text(maxDoc, -1, -1, -1, isSearchable(), isAggregatable());
}
}
protected Explicit<Boolean> coerce;

View File

@ -12,6 +12,12 @@ setup:
type: long
bar:
type: long
geo:
type: geo_point
geo_shape:
type: geo_shape
tree: quadtree
precision: 1m
- do:
indices.create:
@ -26,20 +32,26 @@ setup:
type: long
bar:
type: text
geo:
type: geo_point
geo_shape:
type: geo_shape
tree: quadtree
precision: 1m
- do:
index:
index: test_1
type: test
id: id_1
body: { foo: "bar", number: 123, bar: 123 }
body: { foo: "bar", number: 123, bar: 123, geo: { lat: 48.858093, lon: 2.294694} }
- do:
index:
index: test_2
type: test
id: id_10
body: { foo: "babar", number: 456, bar: "123" }
body: { foo: "babar", number: 456, bar: "123", geo_shape: {type: "linestring", coordinates : [[-77.03653, 38.897676], [-77.009051, 38.889939]] } }
- do:
indices.refresh: {}
@ -48,7 +60,7 @@ setup:
"Basic field stats":
- do:
field_stats:
fields: [foo, number]
fields: [foo, number, geo, geo_shape]
- match: { indices._all.fields.foo.max_doc: 2 }
- match: { indices._all.fields.foo.doc_count: 2 }
@ -68,13 +80,54 @@ setup:
- match: { indices._all.fields.number.max_value: 456 }
- match: { indices._all.fields.number.max_value_as_string: "456" }
- match: { indices._all.fields.number.type: "integer" }
- match: { indices._all.fields.geo.type: "string" }
- match: { indices._all.fields.geo.max_doc: 1 }
- match: { indices._all.fields.geo.doc_count: -1 }
- match: { indices._all.fields.geo.searchable: true }
- match: { indices._all.fields.geo.aggregatable: true }
- match: { indices._all.fields.geo_shape.type: "string" }
- match: { indices._all.fields.geo_shape.max_doc: 1 }
- match: { indices._all.fields.geo_shape.searchable: true }
- match: { indices._all.fields.geo_shape.aggregatable: false }
- is_false: conflicts
---
"Geopoint field stats":
- skip:
version: " - 5.2.0"
reason: geo_point fields don't return min/max for versions greater than 5.2.0
- do:
field_stats:
fields: [geo, geo_shape]
- match: { indices._all.fields.geo.type: "string" }
- match: { indices._all.fields.geo.max_doc: 1 }
- match: { indices._all.fields.geo.doc_count: -1 }
- match: { indices._all.fields.geo.searchable: true }
- match: { indices._all.fields.geo.aggregatable: true }
- is_false: indices._all.fields.geo.min_value
- is_false: indices._all.fields.geo.max_value
- is_false: indices._all.fields.geo.min_value_as_string
- is_false: indices._all.fields.geo.max_value_as_string
- match: { indices._all.fields.geo_shape.type: "string" }
- match: { indices._all.fields.geo_shape.max_doc: 1 }
- match: { indices._all.fields.geo_shape.doc_count: -1 }
- match: { indices._all.fields.geo_shape.searchable: true }
- match: { indices._all.fields.geo_shape.aggregatable: false }
- is_false: indices._all.fields.geo_shape.min_value
- is_false: indices._all.fields.geo_shape.max_value
- is_false: indices._all.fields.geo_shape.min_value_as_string
- is_false: indices._all.fields.geo_shape.max_value_as_string
- is_false: conflicts
---
"Basic field stats with level set to indices":
- do:
field_stats:
fields: [foo, number]
fields: [foo, number, geo, geo_shape]
level: indices
- match: { indices.test_1.fields.foo.max_doc: 1 }
@ -95,6 +148,10 @@ setup:
- match: { indices.test_1.fields.number.max_value: 123 }
- match: { indices.test_1.fields.number.max_value_as_string: "123" }
- match: { indices.test_1.fields.number.type: "integer" }
- match: { indices.test_1.fields.geo.type: "string" }
- match: { indices.test_1.fields.geo.max_doc: 1 }
- match: { indices.test_1.fields.geo.searchable: true }
- match: { indices.test_1.fields.geo.aggregatable: true }
- match: { indices.test_2.fields.foo.max_doc: 1 }
- match: { indices.test_2.fields.foo.doc_count: 1 }
- match: { indices.test_2.fields.foo.min_value: "babar" }
@ -114,6 +171,45 @@ setup:
- match: { indices.test_2.fields.number.max_value: 456 }
- match: { indices.test_2.fields.number.max_value_as_string: "456" }
- match: { indices.test_2.fields.number.type: "integer" }
- match: { indices.test_2.fields.geo_shape.type: "string" }
- match: { indices.test_2.fields.geo_shape.max_doc: 1 }
- match: { indices.test_2.fields.geo_shape.searchable: true }
- match: { indices.test_2.fields.geo_shape.aggregatable: false }
- is_false: indices.test_2.fields.geo
- is_false: conflicts
---
"Geopoint field stats with level set to indices":
- skip:
version: " - 5.2.0"
reason: geo_point fields don't return min/max for versions greater than 5.2.0
- do:
field_stats:
fields: [geo, geo_shape]
level: indices
- match: { indices.test_1.fields.geo.max_doc: 1 }
- match: { indices.test_1.fields.geo.doc_count: -1 }
- is_false: indices.test_1.fields.geo.min_value
- is_false: indices.test_1.fields.geo.max_value
- is_false: indices.test_1.fields.geo.min_value_as_string
- is_false: indices.test_1.fields.geo.max_value_as_string
- match: { indices.test_1.fields.geo.searchable: true }
- match: { indices.test_1.fields.geo.aggregatable: true }
- match: { indices.test_1.fields.geo.type: "string" }
- is_false: indices.test_2.fields.geo
- match: { indices.test_2.fields.geo_shape.max_doc: 1 }
- match: { indices.test_2.fields.geo_shape.doc_count: -1 }
- is_false: indices.test_2.fields.geo_shape.min_value
- is_false: indices.test_2.fields.geo_shape.max_value
- is_false: indices.test_2.fields.geo_shape.min_value_as_string
- is_false: indices.test_2.fields.geo_shape.max_value_as_string
- match: { indices.test_2.fields.geo_shape.searchable: true }
- match: { indices.test_2.fields.geo_shape.aggregatable: false }
- match: { indices.test_2.fields.geo_shape.type: "string" }
- is_false: indices.test_2.fields.geo
- is_false: conflicts
---