[7.x][ML] DF Analytics _explain API should skip object fields (#51115) (#51147)

Object fields cannot be used as features. At the moment the _explain
API includes them and, even worse, it does not error when
an object field is excluded. This creates the expectation for the
user that all child fields will also be excluded, which is not
the case.

This commit omits object fields from the _explain API and also
adds an error if an object field is included or excluded.

Backport of #51115
This commit is contained in:
Dimitris Athanasiou 2020-01-17 14:02:59 +02:00 committed by GitHub
parent 8282744207
commit b70ebdeb96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 77 additions and 2 deletions

View File

@ -44,9 +44,10 @@ The following explanations are provided:
* which fields are included or not in the analysis and why,
* how much memory is estimated to be required. The estimate can be used when
deciding the appropriate value for `model_memory_limit` setting later on,
deciding the appropriate value for `model_memory_limit` setting later on.
about either an existing {dfanalytics-job} or one that has not been created yet.
If you have object fields or fields that are excluded via source filtering,
they are not included in the explanation.
[[ml-explain-dfanalytics-path-params]]

View File

@ -15,6 +15,7 @@ import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.BooleanFieldMapper;
import org.elasticsearch.index.mapper.ObjectMapper;
import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.dataframe.analyses.DataFrameAnalysis;
@ -40,6 +41,7 @@ import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class ExtractedFieldsDetector {
@ -82,6 +84,7 @@ public class ExtractedFieldsDetector {
Set<String> fields = new TreeSet<>(fieldCapabilitiesResponse.get().keySet());
fields.removeAll(IGNORE_FIELDS);
removeFieldsUnderResultsField(fields);
removeObjects(fields);
applySourceFiltering(fields);
FetchSourceContext analyzedFields = config.getAnalyzedFields();
@ -112,6 +115,17 @@ public class ExtractedFieldsDetector {
fields.removeIf(field -> field.startsWith(resultsField + "."));
}
/**
 * Drops every field whose mapping types identify it as an object field.
 * The given set is modified in place.
 *
 * @param fields the candidate field names to prune
 */
private void removeObjects(Set<String> fields) {
    fields.removeIf(candidate -> isObject(getMappingTypes(candidate)));
}
private void applySourceFiltering(Set<String> fields) {
Iterator<String> fieldsIterator = fields.iterator();
while (fieldsIterator.hasNext()) {
@ -178,6 +192,9 @@ public class ExtractedFieldsDetector {
if (analyzedFields == null) {
return;
}
checkIncludesExcludesAreNotObjects(analyzedFields);
String includes = analyzedFields.includes().length == 0 ? "*" : Strings.arrayToCommaDelimitedString(analyzedFields.includes());
String excludes = Strings.arrayToCommaDelimitedString(analyzedFields.excludes());
@ -205,6 +222,16 @@ public class ExtractedFieldsDetector {
}
}
/**
 * Rejects an {@code analyzed_fields} configuration that names an object field
 * in either its includes or its excludes.
 *
 * @param analyzedFields the includes/excludes to validate
 * @throws RuntimeException the bad-request exception built by
 *         {@code ExceptionsHelper.badRequestException}, listing the offending
 *         entries (includes first, then excludes)
 */
private void checkIncludesExcludesAreNotObjects(FetchSourceContext analyzedFields) {
    // Each entry is looked up verbatim through getMappingTypes.
    List<String> offendingEntries = Stream.of(analyzedFields.includes(), analyzedFields.excludes())
        .flatMap(Arrays::stream)
        .filter(entry -> isObject(getMappingTypes(entry)))
        .collect(Collectors.toList());
    if (offendingEntries.isEmpty()) {
        return;
    }
    throw ExceptionsHelper.badRequestException("{} must not include or exclude object fields: {}",
        DataFrameAnalyticsConfig.ANALYZED_FIELDS.getPreferredName(), offendingEntries);
}
private void applyIncludesExcludes(Set<String> fields, Set<String> includes, Set<String> excludes,
Set<FieldSelection> fieldSelection) {
Iterator<String> fieldsIterator = fields.iterator();
@ -394,4 +421,8 @@ public class ExtractedFieldsDetector {
/**
 * Tells whether the given mapping types consist of exactly one type and that
 * type is the boolean content type.
 *
 * @param types the mapping types of a field
 * @return {@code true} only for a single-element set holding the boolean type
 */
private static boolean isBoolean(Set<String> types) {
    if (types.size() != 1) {
        return false;
    }
    return types.contains(BooleanFieldMapper.CONTENT_TYPE);
}
/**
 * Tells whether the given mapping types consist of exactly one type and that
 * type is the object content type.
 *
 * <p>Declared {@code static} because it reads no instance state, matching the
 * other type predicates in this class.
 *
 * @param types the mapping types of a field
 * @return {@code true} only for a single-element set holding the object type
 */
private static boolean isObject(Set<String> types) {
    return types.size() == 1 && types.contains(ObjectMapper.CONTENT_TYPE);
}
}

View File

@ -861,6 +861,49 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
FieldSelection.included("field_22", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL));
}
/**
 * Object fields reported by field caps must not appear among the extracted
 * fields; only the float field is expected to remain.
 */
public void testDetect_GivenObjectFields() {
    FieldCapabilitiesResponse caps = new MockFieldCapsResponseBuilder()
        .addAggregatableField("float_field", "float")
        .addNonAggregatableField("object_field_1", "object")
        .addNonAggregatableField("object_field_2", "object")
        .build();

    ExtractedFieldsDetector detector = new ExtractedFieldsDetector(
        SOURCE_INDEX, buildOutlierDetectionConfig(), 100, caps, Collections.emptyMap());

    List<ExtractedField> extracted = detector.detect().v1().getAllFields();

    assertThat(extracted, hasSize(1));
    assertThat(extracted.get(0).getName(), equalTo("float_field"));
}
/**
 * Naming an object field in the analyzed_fields includes must be rejected
 * with an error that lists the offending field.
 */
public void testDetect_GivenAnalyzedFieldIncludesObjectField() {
    FieldCapabilitiesResponse caps = new MockFieldCapsResponseBuilder()
        .addAggregatableField("float_field", "float")
        .addNonAggregatableField("object_field", "object")
        .build();
    String[] included = new String[] { "float_field", "object_field" };
    analyzedFields = new FetchSourceContext(true, included, null);

    ExtractedFieldsDetector detector = new ExtractedFieldsDetector(
        SOURCE_INDEX, buildOutlierDetectionConfig(), 100, caps, Collections.emptyMap());

    ElasticsearchStatusException failure =
        expectThrows(ElasticsearchStatusException.class, () -> detector.detect());

    assertThat(failure.getMessage(), equalTo("analyzed_fields must not include or exclude object fields: [object_field]"));
}
/**
 * Naming an object field in the analyzed_fields excludes must be rejected
 * with an error that lists the offending field.
 */
public void testDetect_GivenAnalyzedFieldExcludesObjectField() {
    FieldCapabilitiesResponse caps = new MockFieldCapsResponseBuilder()
        .addAggregatableField("float_field", "float")
        .addNonAggregatableField("object_field", "object")
        .build();
    String[] excluded = new String[] { "object_field" };
    analyzedFields = new FetchSourceContext(true, null, excluded);

    ExtractedFieldsDetector detector = new ExtractedFieldsDetector(
        SOURCE_INDEX, buildOutlierDetectionConfig(), 100, caps, Collections.emptyMap());

    ElasticsearchStatusException failure =
        expectThrows(ElasticsearchStatusException.class, () -> detector.detect());

    assertThat(failure.getMessage(), equalTo("analyzed_fields must not include or exclude object fields: [object_field]"));
}
private DataFrameAnalyticsConfig buildOutlierDetectionConfig() {
return new DataFrameAnalyticsConfig.Builder()
.setId("foo")