Add a new _ignored meta field. (#29658)

This adds a new `_ignored` meta field which indexes and stores fields that have
been ignored at index time because of the `ignore_malformed` option. It makes
malformed documents easier to identify by using `exists` or `term(s)` queries
on the `_ignored` field.

Closes #29494
This commit is contained in:
Adrien Grand 2018-05-02 10:47:02 +02:00 committed by GitHub
parent 0aad5fd0f5
commit 7358946bda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 418 additions and 10 deletions

View File

@ -71,8 +71,12 @@ written to by an older Elasticsearch after writing to it with a newer Elasticsea
[[release-notes-6.4.0]]
== {es} 6.4.0
//[float]
//=== New Features
[float]
=== New Features
The new <<mapping-ignored-field,`_ignored`>> field makes it possible to know
which fields got ignored at index time because of the
<<ignore-malformed,`ignore_malformed`>> option. ({pull}29658[#29658])
[float]
=== Enhancements

View File

@ -40,6 +40,14 @@ can be customised when a mapping type is created.
All fields in the document which contain non-null values.
[float]
=== Indexing meta-fields
<<mapping-ignored-field,`_ignored`>>::
All fields in the document that have been ignored at index time because of
<<ignore-malformed,`ignore_malformed`>>.
[float]
=== Routing meta-field
@ -57,6 +65,8 @@ can be customised when a mapping type is created.
include::fields/field-names-field.asciidoc[]
include::fields/ignored-field.asciidoc[]
include::fields/id-field.asciidoc[]
include::fields/index-field.asciidoc[]

View File

@ -0,0 +1,45 @@
[[mapping-ignored-field]]
=== `_ignored` field
added[6.4.0]
The `_ignored` field indexes and stores the names of every field in a document
that has been ignored because it was malformed and
<<ignore-malformed,`ignore_malformed`>> was turned on.
This field is searchable with <<query-dsl-term-query,`term`>>,
<<query-dsl-terms-query,`terms`>> and <<query-dsl-exists-query,`exists`>>
queries, and is returned as part of the search hits.
For instance the below query matches all documents that have one or more fields
that got ignored:
[source,js]
--------------------------------------------------
GET _search
{
"query": {
"exists": {
"field": "_ignored"
}
}
}
--------------------------------------------------
// CONSOLE
Similarly, the below query finds all documents whose `@timestamp` field was
ignored at index time:
[source,js]
--------------------------------------------------
GET _search
{
"query": {
"term": {
"_ignored": "@timestamp"
}
}
}
--------------------------------------------------
// CONSOLE

View File

@ -85,3 +85,13 @@ PUT my_index
<1> The `number_one` field inherits the index-level setting.
<2> The `number_two` field overrides the index-level setting to turn off `ignore_malformed`.
==== Dealing with malformed fields
Malformed fields are silently ignored at indexing time when `ignore_malformed`
is turned on. Whenever possible it is recommended to keep the number of
documents that have a malformed field contained, or queries on this field will
become meaningless. Elasticsearch makes it easy to check how many documents
have malformed fields by using `exists` or `term` queries on the special
<<mapping-ignored-field,`_ignored`>> field.

View File

@ -0,0 +1,92 @@
---
setup:
- skip:
version: " - 6.99.99" # TODO: change to 6.3.99 after backport to 6.4
reason: _ignored was added in 6.4.0
- do:
indices.create:
index: test
body:
mappings:
_doc:
properties:
my_date:
type: date
ignore_malformed: true
store: true
my_ip:
type: ip
ignore_malformed: true
- do:
index:
index: test
type: _doc
id: 1
body: { "my_date": "2018-05-11", "my_ip": ":::1" }
- do:
index:
index: test
type: _doc
id: 2
body: { "my_date": "bar", "my_ip": "192.168.1.42" }
- do:
index:
index: test
type: _doc
id: 3
body: { "my_date": "bar", "my_ip": "quux" }
- do:
indices.refresh: {}
---
"Exists on _ignored":
- do:
search:
body: { query: { exists: { "field": "_ignored" } } }
- length: { hits.hits: 3 }
---
"Search on _ignored with term":
- do:
search:
body: { query: { term: { "_ignored": "my_date" } } }
- length: { hits.hits: 2 }
---
"Search on _ignored with terms":
- do:
search:
body: { query: { terms: { "_ignored": [ "my_date", "my_ip" ] } } }
- length: { hits.hits: 3 }
---
"_ignored is returned by default":
- do:
search:
body: { query: { ids: { "values": [ "3" ] } } }
- length: { hits.hits: 1 }
- length: { hits.hits.0._ignored: 2}
---
"_ignored is still returned with explicit list of stored fields":
- do:
search:
stored_fields: [ "my_date" ]
body: { query: { ids: { "values": [ "3" ] } } }
- length: { hits.hits: 1 }
- is_true: hits.hits.0._ignored

View File

@ -24,6 +24,7 @@ import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.IgnoredFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.RoutingFieldMapper;
@ -69,6 +70,12 @@ public class FieldsVisitor extends StoredFieldVisitor {
if (requiredFields.remove(fieldInfo.name)) {
return Status.YES;
}
// Always load _ignored to be explicit about ignored fields
// This works because _ignored is added as the first metadata mapper,
// so its stored fields always appear first in the list.
if (IgnoredFieldMapper.NAME.equals(fieldInfo.name)) {
return Status.YES;
}
// All these fields are single-valued so we can stop when the set is
// empty
return requiredFields.isEmpty()

View File

@ -446,6 +446,7 @@ public class DateFieldMapper extends FieldMapper {
timestamp = fieldType().parse(dateAsString);
} catch (IllegalArgumentException e) {
if (ignoreMalformed.value()) {
context.addIgnoredField(fieldType.name());
return;
} else {
throw e;

View File

@ -305,6 +305,7 @@ public class GeoPointFieldMapper extends FieldMapper implements ArrayValueMapper
if (ignoreMalformed.value() == false) {
throw e;
}
context.addIgnoredField(fieldType.name());
}
token = context.parser().nextToken();
}
@ -352,6 +353,7 @@ public class GeoPointFieldMapper extends FieldMapper implements ArrayValueMapper
if (ignoreMalformed.value() == false) {
throw e;
}
context.addIgnoredField(fieldType.name());
}
}
}

View File

@ -506,6 +506,7 @@ public class GeoShapeFieldMapper extends FieldMapper {
if (ignoreMalformed.value() == false) {
throw new MapperParsingException("failed to parse [" + fieldType().name() + "]", e);
}
context.addIgnoredField(fieldType.name());
}
return null;
}

View File

@ -0,0 +1,154 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryShardContext;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* A field mapper that records fields that have been ignored because they were malformed.
*/
public final class IgnoredFieldMapper extends MetadataFieldMapper {
public static final String NAME = "_ignored";
public static final String CONTENT_TYPE = "_ignored";
public static class Defaults {
public static final String NAME = IgnoredFieldMapper.NAME;
public static final MappedFieldType FIELD_TYPE = new IgnoredFieldType();
static {
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setStored(true);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
FIELD_TYPE.setName(NAME);
FIELD_TYPE.freeze();
}
}
public static class Builder extends MetadataFieldMapper.Builder<Builder, IgnoredFieldMapper> {
public Builder(MappedFieldType existing) {
super(Defaults.NAME, existing == null ? Defaults.FIELD_TYPE : existing, Defaults.FIELD_TYPE);
}
@Override
public IgnoredFieldMapper build(BuilderContext context) {
return new IgnoredFieldMapper(context.indexSettings());
}
}
public static class TypeParser implements MetadataFieldMapper.TypeParser {
@Override
public MetadataFieldMapper.Builder<?,?> parse(String name, Map<String, Object> node,
ParserContext parserContext) throws MapperParsingException {
return new Builder(parserContext.mapperService().fullName(NAME));
}
@Override
public MetadataFieldMapper getDefault(MappedFieldType fieldType, ParserContext context) {
final Settings indexSettings = context.mapperService().getIndexSettings().getSettings();
return new IgnoredFieldMapper(indexSettings);
}
}
public static final class IgnoredFieldType extends TermBasedFieldType {
public IgnoredFieldType() {
}
protected IgnoredFieldType(IgnoredFieldType ref) {
super(ref);
}
@Override
public IgnoredFieldType clone() {
return new IgnoredFieldType(this);
}
@Override
public String typeName() {
return CONTENT_TYPE;
}
@Override
public Query existsQuery(QueryShardContext context) {
// This query is not performance sensitive, it only helps assess
// quality of the data, so we may use a slow query. It shouldn't
// be too slow in practice since the number of unique terms in this
// field is bounded by the number of fields in the mappings.
return new TermRangeQuery(name(), null, null, true, true);
}
}
private IgnoredFieldMapper(Settings indexSettings) {
super(NAME, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE, indexSettings);
}
@Override
public void preParse(ParseContext context) throws IOException {
}
@Override
public void postParse(ParseContext context) throws IOException {
super.parse(context);
}
@Override
public Mapper parse(ParseContext context) throws IOException {
// done in post-parse
return null;
}
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
for (String field : context.getIgnoredFields()) {
context.doc().add(new Field(NAME, field, fieldType()));
}
}
@Override
protected String contentType() {
return CONTENT_TYPE;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
return builder;
}
}

View File

@ -370,6 +370,7 @@ public class IpFieldMapper extends FieldMapper {
address = InetAddresses.forString(addressAsString);
} catch (IllegalArgumentException e) {
if (ignoreMalformed.value()) {
context.addIgnoredField(fieldType.name());
return;
} else {
throw e;

View File

@ -112,7 +112,7 @@ public class MapperService extends AbstractIndexComponent implements Closeable {
//also missing, not sure if on purpose. See IndicesModule#getMetadataMappers
private static ObjectHashSet<String> META_FIELDS = ObjectHashSet.from(
"_id", "_type", "_routing", "_index",
"_size", "_timestamp", "_ttl"
"_size", "_timestamp", "_ttl", IgnoredFieldMapper.NAME
);
private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(MapperService.class));

View File

@ -990,6 +990,7 @@ public class NumberFieldMapper extends FieldMapper {
numericValue = fieldType().type.parse(parser, coerce.value());
} catch (IllegalArgumentException e) {
if (ignoreMalformed.value()) {
context.addIgnoredField(fieldType.name());
return;
} else {
throw e;

View File

@ -29,9 +29,12 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
public abstract class ParseContext implements Iterable<ParseContext.Document>{
@ -286,6 +289,16 @@ public abstract class ParseContext implements Iterable<ParseContext.Document>{
public Iterator<Document> iterator() {
return in.iterator();
}
/**
 * Forwards {@code field} to the wrapped context {@code in}.
 */
@Override
public void addIgnoredField(String field) {
in.addIgnoredField(field);
}
/**
 * Returns whatever the wrapped context {@code in} reports as ignored fields.
 */
@Override
public Collection<String> getIgnoredFields() {
return in.getIgnoredFields();
}
}
public static class InternalParseContext extends ParseContext {
@ -319,6 +332,8 @@ public abstract class ParseContext implements Iterable<ParseContext.Document>{
private boolean docsReversed = false;
private final Set<String> ignoredFields = new HashSet<>();
public InternalParseContext(@Nullable Settings indexSettings, DocumentMapperParser docMapperParser, DocumentMapper docMapper,
SourceToParse source, XContentParser parser) {
this.indexSettings = indexSettings;
@ -453,6 +468,17 @@ public abstract class ParseContext implements Iterable<ParseContext.Document>{
public Iterator<Document> iterator() {
return documents.iterator();
}
/**
 * Records {@code field} in this context's {@code ignoredFields} set; adding the
 * same name more than once keeps a single entry.
 */
@Override
public void addIgnoredField(String field) {
ignoredFields.add(field);
}
/**
 * Returns a read-only view of the ignored field names recorded so far.
 */
@Override
public Collection<String> getIgnoredFields() {
// Wrapped so callers cannot modify the underlying set.
return Collections.unmodifiableCollection(ignoredFields);
}
}
/**
@ -461,6 +487,17 @@ public abstract class ParseContext implements Iterable<ParseContext.Document>{
*/
public abstract Iterable<Document> nonRootDocuments();
/**
* Add the given {@code field} to the set of ignored fields.
*/
public abstract void addIgnoredField(String field);
/**
* Return the collection of fields that have been ignored so far.
*/
public abstract Collection<String> getIgnoredFields();
public abstract DocumentMapperParser docMapperParser();
/**

View File

@ -35,6 +35,7 @@ import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
import org.elasticsearch.index.mapper.GeoPointFieldMapper;
import org.elasticsearch.index.mapper.GeoShapeFieldMapper;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.IgnoredFieldMapper;
import org.elasticsearch.index.mapper.IndexFieldMapper;
import org.elasticsearch.index.mapper.IpFieldMapper;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
@ -130,7 +131,10 @@ public class IndicesModule extends AbstractModule {
Map<String, MetadataFieldMapper.TypeParser> builtInMetadataMappers;
// Use a LinkedHashMap for metadataMappers because iteration order matters
builtInMetadataMappers = new LinkedHashMap<>();
// ID first so it will be the first stored field to load (so will benefit from "fields: []" early termination
// _ignored first so that we always load it, even if only _id is requested
builtInMetadataMappers.put(IgnoredFieldMapper.NAME, new IgnoredFieldMapper.TypeParser());
// ID second so it will be the first (if no ignored fields) stored field to load
// (so will benefit from "fields: []" early termination)
builtInMetadataMappers.put(IdFieldMapper.NAME, new IdFieldMapper.TypeParser());
builtInMetadataMappers.put(RoutingFieldMapper.NAME, new RoutingFieldMapper.TypeParser());
builtInMetadataMappers.put(IndexFieldMapper.NAME, new IndexFieldMapper.TypeParser());

View File

@ -43,6 +43,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.mapper.IgnoredFieldMapper;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.SourceFieldMapper;
import org.elasticsearch.index.shard.ShardId;
@ -444,8 +445,13 @@ public final class SearchHit implements Streamable, ToXContentObject, Iterable<D
builder.field(Fields._SCORE, score);
}
for (DocumentField field : metaFields) {
Object value = field.getValue();
builder.field(field.getName(), value);
// _ignored is the only multi-valued meta field
// TODO: can we avoid having an exception here?
if (field.getName().equals(IgnoredFieldMapper.NAME)) {
builder.field(field.getName(), field.getValues());
} else {
builder.field(field.getName(), field.<Object>getValue());
}
}
if (source != null) {
XContentHelper.writeRawField(SourceFieldMapper.NAME, source, builder, params);

View File

@ -192,6 +192,7 @@ public class DateFieldMapperTests extends ESSingleNodeTestCase {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
assertArrayEquals(new String[] { "field" }, doc.rootDoc().getValues("_ignored"));
}
public void testChangeFormat() throws IOException {

View File

@ -0,0 +1,29 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
/**
 * Supplies {@link IgnoredFieldMapper.IgnoredFieldType} as the field type under
 * test for {@link FieldTypeTestCase}.
 */
public class IgnoredFieldTypeTests extends FieldTypeTestCase {

    /** Creates a fresh, unconfigured {@code _ignored} field type. */
    @Override
    protected MappedFieldType createDefaultFieldType() {
        final MappedFieldType ignoredType = new IgnoredFieldMapper.IgnoredFieldType();
        return ignoredType;
    }
}

View File

@ -194,6 +194,7 @@ public class IpFieldMapperTests extends ESSingleNodeTestCase {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
assertArrayEquals(new String[] { "field" }, doc.rootDoc().getValues("_ignored"));
}
public void testNullValue() throws IOException {

View File

@ -255,6 +255,7 @@ public class NumberFieldMapperTests extends AbstractNumericFieldMapperTestCase {
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
assertArrayEquals(new String[] { "field" }, doc.rootDoc().getValues("_ignored"));
}
public void testRejectNorms() throws IOException {

View File

@ -21,6 +21,7 @@ package org.elasticsearch.indices;
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.mapper.IgnoredFieldMapper;
import org.elasticsearch.index.mapper.IndexFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
@ -82,9 +83,9 @@ public class IndicesModuleTests extends ESTestCase {
}
});
private static String[] EXPECTED_METADATA_FIELDS = new String[]{IdFieldMapper.NAME, RoutingFieldMapper.NAME,
IndexFieldMapper.NAME, SourceFieldMapper.NAME, TypeFieldMapper.NAME, VersionFieldMapper.NAME,
SeqNoFieldMapper.NAME, FieldNamesFieldMapper.NAME};
private static String[] EXPECTED_METADATA_FIELDS = new String[]{IgnoredFieldMapper.NAME, IdFieldMapper.NAME,
RoutingFieldMapper.NAME, IndexFieldMapper.NAME, SourceFieldMapper.NAME, TypeFieldMapper.NAME,
VersionFieldMapper.NAME, SeqNoFieldMapper.NAME, FieldNamesFieldMapper.NAME};
public void testBuiltinMappers() {
IndicesModule module = new IndicesModule(Collections.emptyList());
@ -106,7 +107,7 @@ public class IndicesModuleTests extends ESTestCase {
greaterThan(noPluginsModule.getMapperRegistry().getMetadataMapperParsers().size()));
Map<String, MetadataFieldMapper.TypeParser> metadataMapperParsers = module.getMapperRegistry().getMetadataMapperParsers();
Iterator<String> iterator = metadataMapperParsers.keySet().iterator();
assertEquals(IdFieldMapper.NAME, iterator.next());
assertEquals(IgnoredFieldMapper.NAME, iterator.next());
String last = null;
while(iterator.hasNext()) {
last = iterator.next();