[7.x][ML] Explain data frame analytics API (#49455) (#49504)

This commit replaces the _estimate_memory_usage API with
a new API, the _explain API.

The API consolidates information that is useful before
creating a data frame analytics job.

It includes:

- memory estimation
- field selection explanation

Memory estimation is carried over from the replaced
_estimate_memory_usage API.

Field selection is a new feature that explains to the user
whether each available field was included in the analysis.
When a field is not included, the reason why is also given.

Backport of #49455
Dimitris Athanasiou 2019-11-22 22:06:10 +02:00 committed by GitHub
parent 69f570ea5f
commit 8eaee7cbdc
46 changed files with 2315 additions and 854 deletions
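
For orientation, here is a minimal end-to-end sketch of how the new API is consumed through the high-level REST client, using the classes added in this diff (the client setup and the job id "my-job" are illustrative assumptions, not part of the change):

ExplainDataFrameAnalyticsRequest request = new ExplainDataFrameAnalyticsRequest("my-job");
ExplainDataFrameAnalyticsResponse response =
    client.machineLearning().explainDataFrameAnalytics(request, RequestOptions.DEFAULT);

// Field selection: one entry per available field, with a reason when a field is excluded.
for (FieldSelection selection : response.getFieldSelection()) {
    System.out.println(selection.getName() + " included=" + selection.isIncluded()
        + (selection.getReason() == null ? "" : " reason=" + selection.getReason()));
}

// Memory estimation: a guide for choosing model_memory_limit before creating the job.
MemoryEstimation estimation = response.getMemoryEstimation();
System.out.println("expected memory without disk: " + estimation.getExpectedMemoryWithoutDisk()
    + ", with disk: " + estimation.getExpectedMemoryWithDisk());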


@ -29,6 +29,7 @@ import org.apache.lucene.util.BytesRef;
import org.elasticsearch.client.RequestConverters.EndpointBuilder;
import org.elasticsearch.client.core.PageParams;
import org.elasticsearch.client.ml.CloseJobRequest;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.DeleteCalendarEventRequest;
import org.elasticsearch.client.ml.DeleteCalendarJobRequest;
import org.elasticsearch.client.ml.DeleteCalendarRequest;
@ -701,12 +702,17 @@ final class MLRequestConverters {
return request;
}
static Request estimateMemoryUsage(PutDataFrameAnalyticsRequest estimateRequest) throws IOException {
String endpoint = new EndpointBuilder()
.addPathPartAsIs("_ml", "data_frame", "analytics", "_estimate_memory_usage")
.build();
Request request = new Request(HttpPost.METHOD_NAME, endpoint);
request.setEntity(createEntity(estimateRequest, REQUEST_BODY_CONTENT_TYPE));
static Request explainDataFrameAnalytics(ExplainDataFrameAnalyticsRequest explainRequest) throws IOException {
EndpointBuilder endpoint = new EndpointBuilder().addPathPartAsIs("_ml", "data_frame", "analytics");
if (explainRequest.getId() != null) {
endpoint.addPathPart(explainRequest.getId());
}
endpoint.addPathPartAsIs("_explain");
Request request = new Request(HttpPost.METHOD_NAME, endpoint.build());
if (explainRequest.getConfig() != null) {
request.setEntity(createEntity(explainRequest.getConfig(), REQUEST_BODY_CONTENT_TYPE));
}
return request;
}
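
To make the routing concrete, these are the two endpoint shapes the converter above produces (the id "foo" is illustrative; the converter tests later in this diff assert the same):

// new ExplainDataFrameAnalyticsRequest("foo")   -> POST /_ml/data_frame/analytics/foo/_explain, no body
// new ExplainDataFrameAnalyticsRequest(config)  -> POST /_ml/data_frame/analytics/_explain, config serialized as the request body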


@ -22,6 +22,8 @@ import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.client.ml.CloseJobRequest;
import org.elasticsearch.client.ml.CloseJobResponse;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsResponse;
import org.elasticsearch.client.ml.DeleteCalendarEventRequest;
import org.elasticsearch.client.ml.DeleteCalendarJobRequest;
import org.elasticsearch.client.ml.DeleteCalendarRequest;
@ -34,7 +36,6 @@ import org.elasticsearch.client.ml.DeleteForecastRequest;
import org.elasticsearch.client.ml.DeleteJobRequest;
import org.elasticsearch.client.ml.DeleteJobResponse;
import org.elasticsearch.client.ml.DeleteModelSnapshotRequest;
import org.elasticsearch.client.ml.EstimateMemoryUsageResponse;
import org.elasticsearch.client.ml.EvaluateDataFrameRequest;
import org.elasticsearch.client.ml.EvaluateDataFrameResponse;
import org.elasticsearch.client.ml.FindFileStructureRequest;
@ -2249,46 +2250,46 @@ public final class MachineLearningClient {
}
/**
* Estimates memory usage for the given Data Frame Analytics
* Explains the given Data Frame Analytics
* <p>
* For additional info
* see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/estimate-memory-usage-dfanalytics.html">
* Estimate Memory Usage for Data Frame Analytics documentation</a>
* see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/explain-dfanalytics.html">
* Explain Data Frame Analytics documentation</a>
*
* @param request The {@link PutDataFrameAnalyticsRequest}
* @param request The {@link ExplainDataFrameAnalyticsRequest}
* @param options Additional request options (e.g. headers), use {@link RequestOptions#DEFAULT} if nothing needs to be customized
* @return {@link EstimateMemoryUsageResponse} response object
* @return {@link ExplainDataFrameAnalyticsResponse} response object
* @throws IOException when there is a serialization issue sending the request or receiving the response
*/
public EstimateMemoryUsageResponse estimateMemoryUsage(PutDataFrameAnalyticsRequest request,
public ExplainDataFrameAnalyticsResponse explainDataFrameAnalytics(ExplainDataFrameAnalyticsRequest request,
RequestOptions options) throws IOException {
return restHighLevelClient.performRequestAndParseEntity(
request,
MLRequestConverters::estimateMemoryUsage,
MLRequestConverters::explainDataFrameAnalytics,
options,
EstimateMemoryUsageResponse::fromXContent,
ExplainDataFrameAnalyticsResponse::fromXContent,
Collections.emptySet());
}
/**
* Estimates memory usage for the given Data Frame Analytics asynchronously and notifies listener upon completion
* Explains the given Data Frame Analytics asynchronously and notifies listener upon completion
* <p>
* For additional info
* see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/estimate-memory-usage-dfanalytics.html">
* Estimate Memory Usage for Data Frame Analytics documentation</a>
* see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/explain-dfanalytics.html">
* Explain Data Frame Analytics documentation</a>
*
* @param request The {@link PutDataFrameAnalyticsRequest}
* @param request The {@link ExplainDataFrameAnalyticsRequest}
* @param options Additional request options (e.g. headers), use {@link RequestOptions#DEFAULT} if nothing needs to be customized
* @param listener Listener to be notified upon request completion
* @return cancellable that may be used to cancel the request
*/
public Cancellable estimateMemoryUsageAsync(PutDataFrameAnalyticsRequest request, RequestOptions options,
ActionListener<EstimateMemoryUsageResponse> listener) {
public Cancellable explainDataFrameAnalyticsAsync(ExplainDataFrameAnalyticsRequest request, RequestOptions options,
ActionListener<ExplainDataFrameAnalyticsResponse> listener) {
return restHighLevelClient.performRequestAsyncAndParseEntity(
request,
MLRequestConverters::estimateMemoryUsage,
MLRequestConverters::explainDataFrameAnalytics,
options,
EstimateMemoryUsageResponse::fromXContent,
ExplainDataFrameAnalyticsResponse::fromXContent,
listener,
Collections.emptySet());
}
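
As a sketch of the asynchronous flavor: the returned Cancellable can abort an in-flight request. The request and listener here are assumed to be built as in the documentation tests further down:

Cancellable cancellable = client.machineLearning()
    .explainDataFrameAnalyticsAsync(request, RequestOptions.DEFAULT, listener);
// ... later, if the result is no longer needed:
cancellable.cancel();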


@ -0,0 +1,72 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml;
import org.elasticsearch.client.Validatable;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.common.Nullable;
import java.util.Objects;
/**
* Request to explain the following about a data frame analytics job:
* <ul>
* <li>field selection: which fields are or are not included in the analysis</li>
* <li>memory estimation: how much memory the job is estimated to require</li>
* </ul>
*/
public class ExplainDataFrameAnalyticsRequest implements Validatable {
private final String id;
private final DataFrameAnalyticsConfig config;
public ExplainDataFrameAnalyticsRequest(String id) {
this.id = Objects.requireNonNull(id);
this.config = null;
}
public ExplainDataFrameAnalyticsRequest(DataFrameAnalyticsConfig config) {
this.id = null;
this.config = Objects.requireNonNull(config);
}
@Nullable
public String getId() {
return id;
}
@Nullable
public DataFrameAnalyticsConfig getConfig() {
return config;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ExplainDataFrameAnalyticsRequest other = (ExplainDataFrameAnalyticsRequest) o;
return Objects.equals(id, other.id) && Objects.equals(config, other.config);
}
@Override
public int hashCode() {
return Objects.hash(id, config);
}
}
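
The two constructors are deliberately mutually exclusive: a request carries either the id of an existing job or an ad-hoc config, never both. A minimal sketch (values illustrative):

ExplainDataFrameAnalyticsRequest byId = new ExplainDataFrameAnalyticsRequest("existing-job-id"); // getConfig() == null
ExplainDataFrameAnalyticsRequest byConfig = new ExplainDataFrameAnalyticsRequest(config);        // getId() == null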


@ -0,0 +1,94 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml;
import org.elasticsearch.client.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.client.ml.dataframe.explain.MemoryEstimation;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
public class ExplainDataFrameAnalyticsResponse implements ToXContentObject {
public static final ParseField TYPE = new ParseField("explain_data_frame_analytics_response");
public static final ParseField FIELD_SELECTION = new ParseField("field_selection");
public static final ParseField MEMORY_ESTIMATION = new ParseField("memory_estimation");
public static ExplainDataFrameAnalyticsResponse fromXContent(XContentParser parser) throws IOException {
return PARSER.parse(parser, null);
}
@SuppressWarnings("unchecked")
static final ConstructingObjectParser<ExplainDataFrameAnalyticsResponse, Void> PARSER =
new ConstructingObjectParser<>(
TYPE.getPreferredName(), true,
args -> new ExplainDataFrameAnalyticsResponse((List<FieldSelection>) args[0], (MemoryEstimation) args[1]));
static {
PARSER.declareObjectArray(ConstructingObjectParser.constructorArg(), FieldSelection.PARSER, FIELD_SELECTION);
PARSER.declareObject(ConstructingObjectParser.constructorArg(), MemoryEstimation.PARSER, MEMORY_ESTIMATION);
}
private final List<FieldSelection> fieldSelection;
private final MemoryEstimation memoryEstimation;
public ExplainDataFrameAnalyticsResponse(List<FieldSelection> fieldSelection, MemoryEstimation memoryEstimation) {
this.fieldSelection = Objects.requireNonNull(fieldSelection);
this.memoryEstimation = Objects.requireNonNull(memoryEstimation);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field(FIELD_SELECTION.getPreferredName(), fieldSelection);
builder.field(MEMORY_ESTIMATION.getPreferredName(), memoryEstimation);
builder.endObject();
return builder;
}
@Override
public boolean equals(Object other) {
if (this == other) return true;
if (other == null || getClass() != other.getClass()) return false;
ExplainDataFrameAnalyticsResponse that = (ExplainDataFrameAnalyticsResponse) other;
return Objects.equals(fieldSelection, that.fieldSelection)
&& Objects.equals(memoryEstimation, that.memoryEstimation);
}
@Override
public int hashCode() {
return Objects.hash(fieldSelection, memoryEstimation);
}
public MemoryEstimation getMemoryEstimation() {
return memoryEstimation;
}
public List<FieldSelection> getFieldSelection() {
return fieldSelection;
}
}
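
A sketch of parsing the response from raw JSON, assuming a `json` string shaped like the example in the reference docs further down:

try (XContentParser parser = JsonXContent.jsonXContent.createParser(
        NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, json)) {
    ExplainDataFrameAnalyticsResponse parsed = ExplainDataFrameAnalyticsResponse.fromXContent(parser);
    // Both parts are non-null by construction (see the constructor above).
    assert parsed.getFieldSelection() != null && parsed.getMemoryEstimation() != null;
}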


@ -0,0 +1,163 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml.dataframe.explain;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
public class FieldSelection implements ToXContentObject {
private static final ParseField NAME = new ParseField("name");
private static final ParseField MAPPING_TYPES = new ParseField("mapping_types");
private static final ParseField IS_INCLUDED = new ParseField("is_included");
private static final ParseField IS_REQUIRED = new ParseField("is_required");
private static final ParseField FEATURE_TYPE = new ParseField("feature_type");
private static final ParseField REASON = new ParseField("reason");
public enum FeatureType {
CATEGORICAL, NUMERICAL;
public static FeatureType fromString(String value) {
return FeatureType.valueOf(value.toUpperCase(Locale.ROOT));
}
@Override
public String toString() {
return name().toLowerCase(Locale.ROOT);
}
}
@SuppressWarnings("unchecked")
public static ConstructingObjectParser<FieldSelection, Void> PARSER = new ConstructingObjectParser<>("field_selection", true,
a -> new FieldSelection((String) a[0], new HashSet<>((List<String>) a[1]), (boolean) a[2], (boolean) a[3], (FeatureType) a[4],
(String) a[5]));
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), NAME);
PARSER.declareStringArray(ConstructingObjectParser.constructorArg(), MAPPING_TYPES);
PARSER.declareBoolean(ConstructingObjectParser.constructorArg(), IS_INCLUDED);
PARSER.declareBoolean(ConstructingObjectParser.constructorArg(), IS_REQUIRED);
PARSER.declareField(ConstructingObjectParser.optionalConstructorArg(), p -> {
if (p.currentToken() == XContentParser.Token.VALUE_STRING) {
return FeatureType.fromString(p.text());
}
throw new IllegalArgumentException("Unsupported token [" + p.currentToken() + "]");
}, FEATURE_TYPE, ObjectParser.ValueType.STRING);
PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), REASON);
}
private final String name;
private final Set<String> mappingTypes;
private final boolean isIncluded;
private final boolean isRequired;
private final FeatureType featureType;
private final String reason;
public static FieldSelection included(String name, Set<String> mappingTypes, boolean isRequired, FeatureType featureType) {
return new FieldSelection(name, mappingTypes, true, isRequired, featureType, null);
}
public static FieldSelection excluded(String name, Set<String> mappingTypes, String reason) {
return new FieldSelection(name, mappingTypes, false, false, null, reason);
}
FieldSelection(String name, Set<String> mappingTypes, boolean isIncluded, boolean isRequired, @Nullable FeatureType featureType,
@Nullable String reason) {
this.name = Objects.requireNonNull(name);
this.mappingTypes = Collections.unmodifiableSet(mappingTypes);
this.isIncluded = isIncluded;
this.isRequired = isRequired;
this.featureType = featureType;
this.reason = reason;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field(NAME.getPreferredName(), name);
builder.field(MAPPING_TYPES.getPreferredName(), mappingTypes);
builder.field(IS_INCLUDED.getPreferredName(), isIncluded);
builder.field(IS_REQUIRED.getPreferredName(), isRequired);
if (featureType != null) {
builder.field(FEATURE_TYPE.getPreferredName(), featureType);
}
if (reason != null) {
builder.field(REASON.getPreferredName(), reason);
}
builder.endObject();
return builder;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
FieldSelection that = (FieldSelection) o;
return Objects.equals(name, that.name)
&& Objects.equals(mappingTypes, that.mappingTypes)
&& isIncluded == that.isIncluded
&& isRequired == that.isRequired
&& Objects.equals(featureType, that.featureType)
&& Objects.equals(reason, that.reason);
}
@Override
public int hashCode() {
return Objects.hash(name, mappingTypes, isIncluded, isRequired, featureType, reason);
}
public String getName() {
return name;
}
public Set<String> getMappingTypes() {
return mappingTypes;
}
public boolean isIncluded() {
return isIncluded;
}
public boolean isRequired() {
return isRequired;
}
@Nullable
public FeatureType getFeatureType() {
return featureType;
}
@Nullable
public String getReason() {
return reason;
}
}
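
The two factory methods mirror the two outcomes field selection can report. A brief sketch, with the field names and reason string borrowed from the reference-docs example below:

FieldSelection included = FieldSelection.included(
    "price", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL);
FieldSelection excluded = FieldSelection.excluded(
    "postcode", Collections.singleton("text"), "[postcode.keyword] is preferred because it is aggregatable");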


@ -16,8 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml;
package org.elasticsearch.client.ml.dataframe.explain;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
@ -26,23 +25,19 @@ import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.Objects;
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg;
public class EstimateMemoryUsageResponse implements ToXContentObject {
public class MemoryEstimation implements ToXContentObject {
public static final ParseField EXPECTED_MEMORY_WITHOUT_DISK = new ParseField("expected_memory_without_disk");
public static final ParseField EXPECTED_MEMORY_WITH_DISK = new ParseField("expected_memory_with_disk");
static final ConstructingObjectParser<EstimateMemoryUsageResponse, Void> PARSER =
new ConstructingObjectParser<>(
"estimate_memory_usage_response",
true,
args -> new EstimateMemoryUsageResponse((ByteSizeValue) args[0], (ByteSizeValue) args[1]));
public static final ConstructingObjectParser<MemoryEstimation, Void> PARSER = new ConstructingObjectParser<>("memory_estimation", true,
a -> new MemoryEstimation((ByteSizeValue) a[0], (ByteSizeValue) a[1]));
static {
PARSER.declareField(
@ -57,14 +52,10 @@ public class EstimateMemoryUsageResponse implements ToXContentObject {
ObjectParser.ValueType.VALUE);
}
public static EstimateMemoryUsageResponse fromXContent(XContentParser parser) {
return PARSER.apply(parser, null);
}
private final ByteSizeValue expectedMemoryWithoutDisk;
private final ByteSizeValue expectedMemoryWithDisk;
public EstimateMemoryUsageResponse(@Nullable ByteSizeValue expectedMemoryWithoutDisk, @Nullable ByteSizeValue expectedMemoryWithDisk) {
public MemoryEstimation(@Nullable ByteSizeValue expectedMemoryWithoutDisk, @Nullable ByteSizeValue expectedMemoryWithDisk) {
this.expectedMemoryWithoutDisk = expectedMemoryWithoutDisk;
this.expectedMemoryWithDisk = expectedMemoryWithDisk;
}
@ -99,7 +90,7 @@ public class EstimateMemoryUsageResponse implements ToXContentObject {
return false;
}
EstimateMemoryUsageResponse that = (EstimateMemoryUsageResponse) other;
MemoryEstimation that = (MemoryEstimation) other;
return Objects.equals(expectedMemoryWithoutDisk, that.expectedMemoryWithoutDisk)
&& Objects.equals(expectedMemoryWithDisk, that.expectedMemoryWithDisk);
}


@ -25,6 +25,7 @@ import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.elasticsearch.client.core.PageParams;
import org.elasticsearch.client.ml.CloseJobRequest;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.DeleteCalendarEventRequest;
import org.elasticsearch.client.ml.DeleteCalendarJobRequest;
import org.elasticsearch.client.ml.DeleteCalendarRequest;
@ -788,16 +789,27 @@ public class MLRequestConvertersTests extends ESTestCase {
}
}
public void testEstimateMemoryUsage() throws IOException {
PutDataFrameAnalyticsRequest estimateRequest = new PutDataFrameAnalyticsRequest(randomDataFrameAnalyticsConfig());
Request request = MLRequestConverters.estimateMemoryUsage(estimateRequest);
public void testExplainDataFrameAnalytics() throws IOException {
// Request with config
{
ExplainDataFrameAnalyticsRequest estimateRequest = new ExplainDataFrameAnalyticsRequest(randomDataFrameAnalyticsConfig());
Request request = MLRequestConverters.explainDataFrameAnalytics(estimateRequest);
assertEquals(HttpPost.METHOD_NAME, request.getMethod());
assertEquals("/_ml/data_frame/analytics/_estimate_memory_usage", request.getEndpoint());
assertEquals("/_ml/data_frame/analytics/_explain", request.getEndpoint());
try (XContentParser parser = createParser(JsonXContent.jsonXContent, request.getEntity().getContent())) {
DataFrameAnalyticsConfig parsedConfig = DataFrameAnalyticsConfig.fromXContent(parser);
assertThat(parsedConfig, equalTo(estimateRequest.getConfig()));
}
}
// Request with id
{
ExplainDataFrameAnalyticsRequest estimateRequest = new ExplainDataFrameAnalyticsRequest("foo");
Request request = MLRequestConverters.explainDataFrameAnalytics(estimateRequest);
assertEquals(HttpPost.METHOD_NAME, request.getMethod());
assertEquals("/_ml/data_frame/analytics/foo/_explain", request.getEndpoint());
assertNull(request.getEntity());
}
}
public void testGetTrainedModels() {
String modelId1 = randomAlphaOfLength(10);


@ -32,6 +32,8 @@ import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.indices.GetIndexRequest;
import org.elasticsearch.client.ml.CloseJobRequest;
import org.elasticsearch.client.ml.CloseJobResponse;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsResponse;
import org.elasticsearch.client.ml.DeleteCalendarEventRequest;
import org.elasticsearch.client.ml.DeleteCalendarJobRequest;
import org.elasticsearch.client.ml.DeleteCalendarRequest;
@ -44,7 +46,6 @@ import org.elasticsearch.client.ml.DeleteForecastRequest;
import org.elasticsearch.client.ml.DeleteJobRequest;
import org.elasticsearch.client.ml.DeleteJobResponse;
import org.elasticsearch.client.ml.DeleteModelSnapshotRequest;
import org.elasticsearch.client.ml.EstimateMemoryUsageResponse;
import org.elasticsearch.client.ml.EvaluateDataFrameRequest;
import org.elasticsearch.client.ml.EvaluateDataFrameResponse;
import org.elasticsearch.client.ml.FindFileStructureRequest;
@ -140,6 +141,8 @@ import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.Binar
import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.ConfusionMatrixMetric;
import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.PrecisionMetric;
import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.RecallMetric;
import org.elasticsearch.client.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.client.ml.dataframe.explain.MemoryEstimation;
import org.elasticsearch.client.ml.filestructurefinder.FileStructure;
import org.elasticsearch.client.ml.inference.TrainedModelConfig;
import org.elasticsearch.client.ml.inference.TrainedModelDefinition;
@ -1996,8 +1999,8 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
highLevelClient().indices().create(new CreateIndexRequest(indexName).mapping(mapping), RequestOptions.DEFAULT);
}
public void testEstimateMemoryUsage() throws IOException {
String indexName = "estimate-test-index";
public void testExplainDataFrameAnalytics() throws IOException {
String indexName = "explain-df-test-index";
createIndex(indexName, mappingForSoftClassification());
BulkRequest bulk1 = new BulkRequest()
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
@ -2007,8 +2010,8 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
highLevelClient().bulk(bulk1, RequestOptions.DEFAULT);
MachineLearningClient machineLearningClient = highLevelClient().machineLearning();
PutDataFrameAnalyticsRequest estimateMemoryUsageRequest =
new PutDataFrameAnalyticsRequest(
ExplainDataFrameAnalyticsRequest explainRequest =
new ExplainDataFrameAnalyticsRequest(
DataFrameAnalyticsConfig.builder()
.setSource(DataFrameAnalyticsSource.builder().setIndex(indexName).build())
.setAnalysis(OutlierDetection.createDefault())
@ -2019,11 +2022,16 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
ByteSizeValue upperBound = new ByteSizeValue(1, ByteSizeUnit.GB);
// Data Frame has 10 rows, expect that the returned estimates fall within the (1kB, 1GB) range.
EstimateMemoryUsageResponse response1 =
execute(
estimateMemoryUsageRequest, machineLearningClient::estimateMemoryUsage, machineLearningClient::estimateMemoryUsageAsync);
assertThat(response1.getExpectedMemoryWithoutDisk(), allOf(greaterThanOrEqualTo(lowerBound), lessThan(upperBound)));
assertThat(response1.getExpectedMemoryWithDisk(), allOf(greaterThanOrEqualTo(lowerBound), lessThan(upperBound)));
ExplainDataFrameAnalyticsResponse response1 = execute(explainRequest, machineLearningClient::explainDataFrameAnalytics,
machineLearningClient::explainDataFrameAnalyticsAsync);
MemoryEstimation memoryEstimation1 = response1.getMemoryEstimation();
assertThat(memoryEstimation1.getExpectedMemoryWithoutDisk(), allOf(greaterThanOrEqualTo(lowerBound), lessThan(upperBound)));
assertThat(memoryEstimation1.getExpectedMemoryWithDisk(), allOf(greaterThanOrEqualTo(lowerBound), lessThan(upperBound)));
List<FieldSelection> fieldSelection = response1.getFieldSelection();
assertThat(fieldSelection.size(), equalTo(3));
assertThat(fieldSelection.stream().map(FieldSelection::getName).collect(Collectors.toList()), contains("dataset", "label", "p"));
BulkRequest bulk2 = new BulkRequest()
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
@ -2033,15 +2041,16 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
highLevelClient().bulk(bulk2, RequestOptions.DEFAULT);
// Data Frame now has 100 rows, expect that the returned estimates will be greater than or equal to the previous ones.
EstimateMemoryUsageResponse response2 =
ExplainDataFrameAnalyticsResponse response2 =
execute(
estimateMemoryUsageRequest, machineLearningClient::estimateMemoryUsage, machineLearningClient::estimateMemoryUsageAsync);
explainRequest, machineLearningClient::explainDataFrameAnalytics, machineLearningClient::explainDataFrameAnalyticsAsync);
MemoryEstimation memoryEstimation2 = response2.getMemoryEstimation();
assertThat(
response2.getExpectedMemoryWithoutDisk(),
allOf(greaterThanOrEqualTo(response1.getExpectedMemoryWithoutDisk()), lessThan(upperBound)));
memoryEstimation2.getExpectedMemoryWithoutDisk(),
allOf(greaterThanOrEqualTo(memoryEstimation1.getExpectedMemoryWithoutDisk()), lessThan(upperBound)));
assertThat(
response2.getExpectedMemoryWithDisk(),
allOf(greaterThanOrEqualTo(response1.getExpectedMemoryWithDisk()), lessThan(upperBound)));
memoryEstimation2.getExpectedMemoryWithDisk(),
allOf(greaterThanOrEqualTo(memoryEstimation1.getExpectedMemoryWithDisk()), lessThan(upperBound)));
}
public void testGetTrainedModels() throws Exception {


@ -36,6 +36,8 @@ import org.elasticsearch.client.core.PageParams;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.ml.CloseJobRequest;
import org.elasticsearch.client.ml.CloseJobResponse;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsRequest;
import org.elasticsearch.client.ml.ExplainDataFrameAnalyticsResponse;
import org.elasticsearch.client.ml.DeleteCalendarEventRequest;
import org.elasticsearch.client.ml.DeleteCalendarJobRequest;
import org.elasticsearch.client.ml.DeleteCalendarRequest;
@ -48,7 +50,6 @@ import org.elasticsearch.client.ml.DeleteForecastRequest;
import org.elasticsearch.client.ml.DeleteJobRequest;
import org.elasticsearch.client.ml.DeleteJobResponse;
import org.elasticsearch.client.ml.DeleteModelSnapshotRequest;
import org.elasticsearch.client.ml.EstimateMemoryUsageResponse;
import org.elasticsearch.client.ml.EvaluateDataFrameRequest;
import org.elasticsearch.client.ml.EvaluateDataFrameResponse;
import org.elasticsearch.client.ml.FindFileStructureRequest;
@ -155,6 +156,8 @@ import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.Confu
import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.ConfusionMatrixMetric.ConfusionMatrix;
import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.PrecisionMetric;
import org.elasticsearch.client.ml.dataframe.evaluation.softclassification.RecallMetric;
import org.elasticsearch.client.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.client.ml.dataframe.explain.MemoryEstimation;
import org.elasticsearch.client.ml.filestructurefinder.FileStructure;
import org.elasticsearch.client.ml.inference.TrainedModelConfig;
import org.elasticsearch.client.ml.inference.TrainedModelDefinition;
@ -213,6 +216,7 @@ import java.util.zip.GZIPOutputStream;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.closeTo;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
@ -3460,10 +3464,10 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
}
}
public void testEstimateMemoryUsage() throws Exception {
createIndex("estimate-test-source-index");
public void testExplainDataFrameAnalytics() throws Exception {
createIndex("explain-df-test-source-index");
BulkRequest bulkRequest =
new BulkRequest("estimate-test-source-index")
new BulkRequest("explain-df-test-source-index")
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
for (int i = 0; i < 10; ++i) {
bulkRequest.add(new IndexRequest().source(XContentType.JSON, "timestamp", 123456789L, "total", 10L));
@ -3471,22 +3475,33 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
RestHighLevelClient client = highLevelClient();
client.bulk(bulkRequest, RequestOptions.DEFAULT);
{
// tag::estimate-memory-usage-request
// tag::explain-data-frame-analytics-id-request
ExplainDataFrameAnalyticsRequest request = new ExplainDataFrameAnalyticsRequest("existing_job_id"); // <1>
// end::explain-data-frame-analytics-id-request
// tag::explain-data-frame-analytics-config-request
DataFrameAnalyticsConfig config = DataFrameAnalyticsConfig.builder()
.setSource(DataFrameAnalyticsSource.builder().setIndex("estimate-test-source-index").build())
.setSource(DataFrameAnalyticsSource.builder().setIndex("explain-df-test-source-index").build())
.setAnalysis(OutlierDetection.createDefault())
.build();
PutDataFrameAnalyticsRequest request = new PutDataFrameAnalyticsRequest(config); // <1>
// end::estimate-memory-usage-request
request = new ExplainDataFrameAnalyticsRequest(config); // <1>
// end::explain-data-frame-analytics-config-request
// tag::estimate-memory-usage-execute
EstimateMemoryUsageResponse response = client.machineLearning().estimateMemoryUsage(request, RequestOptions.DEFAULT);
// end::estimate-memory-usage-execute
// tag::explain-data-frame-analytics-execute
ExplainDataFrameAnalyticsResponse response = client.machineLearning().explainDataFrameAnalytics(request,
RequestOptions.DEFAULT);
// end::explain-data-frame-analytics-execute
// tag::estimate-memory-usage-response
ByteSizeValue expectedMemoryWithoutDisk = response.getExpectedMemoryWithoutDisk(); // <1>
ByteSizeValue expectedMemoryWithDisk = response.getExpectedMemoryWithDisk(); // <2>
// end::estimate-memory-usage-response
// tag::explain-data-frame-analytics-response
List<FieldSelection> fieldSelection = response.getFieldSelection(); // <1>
MemoryEstimation memoryEstimation = response.getMemoryEstimation(); // <2>
// end::explain-data-frame-analytics-response
assertThat(fieldSelection.size(), equalTo(2));
assertThat(fieldSelection.stream().map(FieldSelection::getName).collect(Collectors.toList()), contains("timestamp", "total"));
ByteSizeValue expectedMemoryWithoutDisk = memoryEstimation.getExpectedMemoryWithoutDisk(); // <1>
ByteSizeValue expectedMemoryWithDisk = memoryEstimation.getExpectedMemoryWithDisk(); // <2>
// We are pretty liberal here as this test does not aim at verifying concrete numbers but rather the end-to-end user workflow.
ByteSizeValue lowerBound = new ByteSizeValue(1, ByteSizeUnit.KB);
@ -3496,14 +3511,14 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
}
{
DataFrameAnalyticsConfig config = DataFrameAnalyticsConfig.builder()
.setSource(DataFrameAnalyticsSource.builder().setIndex("estimate-test-source-index").build())
.setSource(DataFrameAnalyticsSource.builder().setIndex("explain-df-test-source-index").build())
.setAnalysis(OutlierDetection.createDefault())
.build();
PutDataFrameAnalyticsRequest request = new PutDataFrameAnalyticsRequest(config);
// tag::estimate-memory-usage-execute-listener
ActionListener<EstimateMemoryUsageResponse> listener = new ActionListener<EstimateMemoryUsageResponse>() {
ExplainDataFrameAnalyticsRequest request = new ExplainDataFrameAnalyticsRequest(config);
// tag::explain-data-frame-analytics-execute-listener
ActionListener<ExplainDataFrameAnalyticsResponse> listener = new ActionListener<ExplainDataFrameAnalyticsResponse>() {
@Override
public void onResponse(EstimateMemoryUsageResponse response) {
public void onResponse(ExplainDataFrameAnalyticsResponse response) {
// <1>
}
@ -3512,15 +3527,15 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
// <2>
}
};
// end::estimate-memory-usage-execute-listener
// end::explain-data-frame-analytics-execute-listener
// Replace the empty listener by a blocking listener in test
final CountDownLatch latch = new CountDownLatch(1);
listener = new LatchedActionListener<>(listener, latch);
// tag::estimate-memory-usage-execute-async
client.machineLearning().estimateMemoryUsageAsync(request, RequestOptions.DEFAULT, listener); // <1>
// end::estimate-memory-usage-execute-async
// tag::explain-data-frame-analytics-execute-async
client.machineLearning().explainDataFrameAnalyticsAsync(request, RequestOptions.DEFAULT, listener); // <1>
// end::explain-data-frame-analytics-execute-async
assertTrue(latch.await(30L, TimeUnit.SECONDS));
}


@ -0,0 +1,44 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.client.ml.dataframe.DataFrameAnalyticsConfigTests;
import org.elasticsearch.test.ESTestCase;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.nullValue;
public class ExplainDataFrameAnalyticsRequestTests extends ESTestCase {
public void testIdConstructor() {
ExplainDataFrameAnalyticsRequest request = new ExplainDataFrameAnalyticsRequest("foo");
assertThat(request.getId(), equalTo("foo"));
assertThat(request.getConfig(), is(nullValue()));
}
public void testConfigConstructor() {
DataFrameAnalyticsConfig config = DataFrameAnalyticsConfigTests.randomDataFrameAnalyticsConfig();
ExplainDataFrameAnalyticsRequest request = new ExplainDataFrameAnalyticsRequest(config);
assertThat(request.getId(), is(nullValue()));
assertThat(request.getConfig(), equalTo(config));
}
}


@ -0,0 +1,54 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml;
import org.elasticsearch.client.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.client.ml.dataframe.explain.FieldSelectionTests;
import org.elasticsearch.client.ml.dataframe.explain.MemoryEstimation;
import org.elasticsearch.client.ml.dataframe.explain.MemoryEstimationTests;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.IntStream;
public class ExplainDataFrameAnalyticsResponseTests extends AbstractXContentTestCase<ExplainDataFrameAnalyticsResponse> {
@Override
protected ExplainDataFrameAnalyticsResponse createTestInstance() {
int fieldSelectionCount = randomIntBetween(1, 5);
List<FieldSelection> fieldSelection = new ArrayList<>(fieldSelectionCount);
IntStream.range(0, fieldSelectionCount).forEach(i -> fieldSelection.add(FieldSelectionTests.createRandom()));
MemoryEstimation memoryEstimation = MemoryEstimationTests.createRandom();
return new ExplainDataFrameAnalyticsResponse(fieldSelection, memoryEstimation);
}
@Override
protected ExplainDataFrameAnalyticsResponse doParseInstance(XContentParser parser) throws IOException {
return ExplainDataFrameAnalyticsResponse.fromXContent(parser);
}
@Override
protected boolean supportsUnknownFields() {
return true;
}
}


@ -0,0 +1,57 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml.dataframe.explain;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
import java.util.Set;
import java.util.stream.Collectors;
public class FieldSelectionTests extends AbstractXContentTestCase<FieldSelection> {
public static FieldSelection createRandom() {
Set<String> mappingTypes = randomSubsetOf(randomIntBetween(1, 3), "int", "float", "double", "text", "keyword", "ip")
.stream().collect(Collectors.toSet());
FieldSelection.FeatureType featureType = randomBoolean() ? null : randomFrom(FieldSelection.FeatureType.values());
String reason = randomBoolean() ? null : randomAlphaOfLength(20);
return new FieldSelection(randomAlphaOfLength(10),
mappingTypes,
randomBoolean(),
randomBoolean(),
featureType,
reason);
}
@Override
protected FieldSelection createTestInstance() {
return createRandom();
}
@Override
protected FieldSelection doParseInstance(XContentParser parser) throws IOException {
return FieldSelection.PARSER.apply(parser, null);
}
@Override
protected boolean supportsUnknownFields() {
return true;
}
}


@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.client.ml;
package org.elasticsearch.client.ml.dataframe.explain;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.XContentParser;
@ -24,22 +24,22 @@ import org.elasticsearch.test.AbstractXContentTestCase;
import java.io.IOException;
public class EstimateMemoryUsageResponseTests extends AbstractXContentTestCase<EstimateMemoryUsageResponse> {
public class MemoryEstimationTests extends AbstractXContentTestCase<MemoryEstimation> {
public static EstimateMemoryUsageResponse randomResponse() {
return new EstimateMemoryUsageResponse(
public static MemoryEstimation createRandom() {
return new MemoryEstimation(
randomBoolean() ? new ByteSizeValue(randomNonNegativeLong()) : null,
randomBoolean() ? new ByteSizeValue(randomNonNegativeLong()) : null);
}
@Override
protected EstimateMemoryUsageResponse createTestInstance() {
return randomResponse();
protected MemoryEstimation createTestInstance() {
return createRandom();
}
@Override
protected EstimateMemoryUsageResponse doParseInstance(XContentParser parser) throws IOException {
return EstimateMemoryUsageResponse.fromXContent(parser);
protected MemoryEstimation doParseInstance(XContentParser parser) throws IOException {
return MemoryEstimation.PARSER.apply(parser, null);
}
@Override


@ -1,36 +0,0 @@
--
:api: estimate-memory-usage
:request: PutDataFrameAnalyticsRequest
:response: EstimateMemoryUsageResponse
--
[role="xpack"]
[id="{upid}-{api}"]
=== Estimate memory usage API
Estimates memory usage of {dfanalytics}.
Estimation results can be used when deciding the appropriate value for `model_memory_limit` setting later on.
The API accepts an +{request}+ object and returns an +{response}+.
[id="{upid}-{api}-request"]
==== Estimate memory usage request
["source","java",subs="attributes,callouts,macros"]
--------------------------------------------------
include-tagged::{doc-tests-file}[{api}-request]
--------------------------------------------------
<1> Constructing a new request containing a {dataframe-analytics-config} for which memory usage estimation should be performed
include::../execution.asciidoc[]
[id="{upid}-{api}-response"]
==== Response
The returned +{response}+ contains the memory usage estimates.
["source","java",subs="attributes,callouts,macros"]
--------------------------------------------------
include-tagged::{doc-tests-file}[{api}-response]
--------------------------------------------------
<1> Estimated memory usage under the assumption that the whole {dfanalytics} should happen in memory (i.e. without overflowing to disk).
<2> Estimated memory usage under the assumption that overflowing to disk is allowed during {dfanalytics}.


@ -0,0 +1,48 @@
--
:api: explain-data-frame-analytics
:request: ExplainDataFrameAnalyticsRequest
:response: ExplainDataFrameAnalyticsResponse
--
[role="xpack"]
[id="{upid}-{api}"]
=== Explain {dfanalytics} API
Explains the following about a {dataframe-analytics-config}:
* field selection: which fields are or are not included in the analysis
* memory estimation: how much memory is estimated to be required. The estimate can be used when deciding the appropriate value for the `model_memory_limit` setting later on.
The API accepts an +{request}+ object and returns an +{response}+.
[id="{upid}-{api}-request"]
==== Explain {dfanalytics} request
The request can be constructed with the id of an existing {dfanalytics-job}.
["source","java",subs="attributes,callouts,macros"]
--------------------------------------------------
include-tagged::{doc-tests-file}[{api}-id-request]
--------------------------------------------------
<1> Constructing a new request with the id of an existing {dfanalytics-job}
It can also be constructed with a {dataframe-analytics-config}, to explain a configuration before the job is created.
["source","java",subs="attributes,callouts,macros"]
--------------------------------------------------
include-tagged::{doc-tests-file}[{api}-config-request]
--------------------------------------------------
<1> Constructing a new request containing a {dataframe-analytics-config}
include::../execution.asciidoc[]
[id="{upid}-{api}-response"]
==== Response
The returned +{response}+ contains the field selection and the memory usage estimation.
["source","java",subs="attributes,callouts,macros"]
--------------------------------------------------
include-tagged::{doc-tests-file}[{api}-response]
--------------------------------------------------
<1> A list where each item explains whether a field was selected for analysis or not
<2> The memory estimation for the {dfanalytics-job}


@ -300,7 +300,7 @@ The Java High Level REST Client supports the following Machine Learning APIs:
* <<{upid}-start-data-frame-analytics>>
* <<{upid}-stop-data-frame-analytics>>
* <<{upid}-evaluate-data-frame>>
* <<{upid}-estimate-memory-usage>>
* <<{upid}-explain-data-frame-analytics>>
* <<{upid}-get-trained-models>>
* <<{upid}-put-filter>>
* <<{upid}-get-filters>>
@ -353,7 +353,7 @@ include::ml/delete-data-frame-analytics.asciidoc[]
include::ml/start-data-frame-analytics.asciidoc[]
include::ml/stop-data-frame-analytics.asciidoc[]
include::ml/evaluate-data-frame.asciidoc[]
include::ml/estimate-memory-usage.asciidoc[]
include::ml/explain-data-frame-analytics.asciidoc[]
include::ml/get-trained-models.asciidoc[]
include::ml/put-filter.asciidoc[]
include::ml/get-filters.asciidoc[]


@ -1,80 +0,0 @@
[role="xpack"]
[testenv="platinum"]
[[estimate-memory-usage-dfanalytics]]
=== Estimate memory usage API
[subs="attributes"]
++++
<titleabbrev>Estimate memory usage for {dfanalytics-jobs}</titleabbrev>
++++
Estimates memory usage for the given {dataframe-analytics-config}.
experimental[]
[[ml-estimate-memory-usage-dfanalytics-request]]
==== {api-request-title}
`POST _ml/data_frame/analytics/_estimate_memory_usage`
[[ml-estimate-memory-usage-dfanalytics-prereq]]
==== {api-prereq-title}
* You must have `monitor_ml` privilege to use this API. For more
information, see <<security-privileges>> and <<built-in-roles>>.
[[ml-estimate-memory-usage-dfanalytics-desc]]
==== {api-description-title}
This API estimates memory usage for the given {dataframe-analytics-config} before the {dfanalytics-job} is even created.
Serves as an advice on how to set `model_memory_limit` when creating {dfanalytics-job}.
[[ml-estimate-memory-usage-dfanalytics-request-body]]
==== {api-request-body-title}
`data_frame_analytics_config`::
(Required, object) Intended configuration of {dfanalytics-job}. For more information, see
<<ml-dfanalytics-resources>>.
Note that `id` and `dest` don't need to be provided in the context of this API.
[[ml-estimate-memory-usage-dfanalytics-results]]
==== {api-response-body-title}
`expected_memory_without_disk`::
(string) Estimated memory usage under the assumption that the whole {dfanalytics} should happen in memory
(i.e. without overflowing to disk).
`expected_memory_with_disk`::
(string) Estimated memory usage under the assumption that overflowing to disk is allowed during {dfanalytics}.
`expected_memory_with_disk` is usually smaller than `expected_memory_without_disk` as using disk allows to
limit the main memory needed to perform {dfanalytics}.
[[ml-estimate-memory-usage-dfanalytics-example]]
==== {api-examples-title}
[source,console]
--------------------------------------------------
POST _ml/data_frame/analytics/_estimate_memory_usage
{
"data_frame_analytics_config": {
"source": {
"index": "logdata"
},
"analysis": {
"outlier_detection": {}
}
}
}
--------------------------------------------------
// TEST[skip:TBD]
The API returns the following results:
[source,console-result]
----
{
"expected_memory_without_disk": "128MB",
"expected_memory_with_disk": "32MB"
}
----


@ -0,0 +1,159 @@
[role="xpack"]
[testenv="platinum"]
[[explain-dfanalytics]]
=== Explain {dfanalytics} API
[subs="attributes"]
++++
<titleabbrev>Explain {dfanalytics} API</titleabbrev>
++++
Explains a {dataframe-analytics-config}.
experimental[]
[[ml-explain-dfanalytics-request]]
==== {api-request-title}
`GET _ml/data_frame/analytics/_explain` +
`POST _ml/data_frame/analytics/_explain` +
`GET _ml/data_frame/analytics/<data_frame_analytics_id>/_explain` +
`POST _ml/data_frame/analytics/<data_frame_analytics_id>/_explain`
[[ml-explain-dfanalytics-prereq]]
==== {api-prereq-title}
* You must have `monitor_ml` privilege to use this API. For more
information, see <<security-privileges>> and <<built-in-roles>>.
[[ml-explain-dfanalytics-desc]]
==== {api-description-title}
This API provides explanations for a {dataframe-analytics-config}, either one that already exists or one that has not been created yet.
The following explanations are provided:
* which fields are or are not included in the analysis, and why
* how much memory is estimated to be required. The estimate can be used when deciding the appropriate value for the `model_memory_limit` setting later on.
[[ml-explain-dfanalytics-path-params]]
==== {api-path-parms-title}
`<data_frame_analytics_id>`::
(Optional, string) Identifier for the existing {dfanalytics-job} to explain. This
identifier can contain lowercase alphanumeric characters (a-z and 0-9), hyphens, and
underscores. It must start and end with alphanumeric characters.
[[ml-explain-dfanalytics-request-body]]
==== {api-request-body-title}
`data_frame_analytics_config`::
(Optional, object) Intended configuration of {dfanalytics-job}. For more information, see
<<ml-dfanalytics-resources>>.
Note that `id` and `dest` don't need to be provided in the context of this API.
[[ml-explain-dfanalytics-results]]
==== {api-response-body-title}
The API returns a response that contains the following:
`field_selection`::
(array) An array of objects that explain the selection decision for each field, sorted by field name.
Each object in the array has the following properties:
`name`:::
(string) The field name.
`mapping_types`:::
(array of strings) The mapping types of the field.
`is_included`:::
(boolean) Whether the field is selected to be included in the analysis.
`is_required`:::
(boolean) Whether the field is required.
`feature_type`:::
(string) The feature type of this field for the analysis. May be `categorical` or `numerical`.
`reason`:::
(string) The reason a field is not included in the analysis.
`memory_estimation`::
(object) An object containing the memory estimates. The object has the following properties:
`expected_memory_without_disk`:::
(string) Estimated memory usage under the assumption that the whole {dfanalytics} should happen in memory
(i.e. without overflowing to disk).
`expected_memory_with_disk`:::
(string) Estimated memory usage under the assumption that overflowing to disk is allowed during {dfanalytics}.
`expected_memory_with_disk` is usually smaller than `expected_memory_without_disk` because using
disk makes it possible to limit the main memory needed to perform {dfanalytics}.
[[ml-explain-dfanalytics-example]]
==== {api-examples-title}
[source,console]
--------------------------------------------------
POST _ml/data_frame/analytics/_explain
{
"data_frame_analytics_config": {
"source": {
"index": "houses_sold_last_10_yrs"
},
"analysis": {
"regression": {
"dependent_variable": "price"
}
}
}
}
--------------------------------------------------
// TEST[skip:TBD]
The API returns the following results:
[source,console-result]
----
{
"field_selection": [
{
"field": "number_of_bedrooms",
"mappings_types": ["integer"],
"is_included": true,
"is_required": false,
"feature_type": "numerical"
},
{
"field": "postcode",
"mappings_types": ["text"],
"is_included": false,
"is_required": false,
"reason": "[postcode.keyword] is preferred because it is aggregatable"
},
{
"field": "postcode.keyword",
"mappings_types": ["keyword"],
"is_included": true,
"is_required": false,
"feature_type": "categorical"
},
{
"field": "price",
"mappings_types": ["float"],
"is_included": true,
"is_required": true,
"feature_type": "numerical"
}
],
"memory_estimation": {
"expected_memory_without_disk": "128MB",
"expected_memory_with_disk": "32MB"
}
}
----


@ -12,7 +12,7 @@ You can use the following APIs to perform {ml} {dfanalytics} activities.
* <<start-dfanalytics,Start {dfanalytics-jobs}>>
* <<stop-dfanalytics,Stop {dfanalytics-jobs}>>
* <<evaluate-dfanalytics,Evaluate {dfanalytics}>>
* <<estimate-memory-usage-dfanalytics,Estimate memory usage for {dfanalytics}>>
* <<explain-dfanalytics,Explain {dfanalytics}>>
See also <<ml-apis>>.
@ -23,7 +23,7 @@ include::delete-dfanalytics.asciidoc[]
//EVALUATE
include::evaluate-dfanalytics.asciidoc[]
//EXPLAIN
include::estimate-memory-usage-dfanalytics.asciidoc[]
include::explain-dfanalytics.asciidoc[]
//GET
include::get-dfanalytics.asciidoc[]
include::get-dfanalytics-stats.asciidoc[]


@ -79,6 +79,7 @@ import org.elasticsearch.xpack.core.ml.MachineLearningFeatureSetUsage;
import org.elasticsearch.xpack.core.ml.MlMetadata;
import org.elasticsearch.xpack.core.ml.MlTasks;
import org.elasticsearch.xpack.core.ml.action.CloseJobAction;
import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.DeleteCalendarAction;
import org.elasticsearch.xpack.core.ml.action.DeleteCalendarEventAction;
import org.elasticsearch.xpack.core.ml.action.DeleteDataFrameAnalyticsAction;
@ -89,7 +90,6 @@ import org.elasticsearch.xpack.core.ml.action.DeleteForecastAction;
import org.elasticsearch.xpack.core.ml.action.DeleteJobAction;
import org.elasticsearch.xpack.core.ml.action.DeleteModelSnapshotAction;
import org.elasticsearch.xpack.core.ml.action.DeleteTrainedModelAction;
import org.elasticsearch.xpack.core.ml.action.EstimateMemoryUsageAction;
import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction;
import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction;
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
@ -158,6 +158,10 @@ import org.elasticsearch.xpack.core.ml.dataframe.evaluation.softclassification.P
import org.elasticsearch.xpack.core.ml.dataframe.evaluation.softclassification.Recall;
import org.elasticsearch.xpack.core.ml.dataframe.evaluation.softclassification.ScoreByThresholdResult;
import org.elasticsearch.xpack.core.ml.dataframe.evaluation.softclassification.SoftClassificationMetric;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.FrequencyEncoding;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.OneHotEncoding;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.PreProcessor;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.TargetMeanEncoding;
import org.elasticsearch.xpack.core.ml.inference.results.ClassificationInferenceResults;
import org.elasticsearch.xpack.core.ml.inference.results.InferenceResults;
import org.elasticsearch.xpack.core.ml.inference.results.RegressionInferenceResults;
@ -171,10 +175,6 @@ import org.elasticsearch.xpack.core.ml.inference.trainedmodel.ensemble.OutputAgg
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.ensemble.WeightedMode;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.ensemble.WeightedSum;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.tree.Tree;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.FrequencyEncoding;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.OneHotEncoding;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.PreProcessor;
import org.elasticsearch.xpack.core.ml.inference.preprocessing.TargetMeanEncoding;
import org.elasticsearch.xpack.core.ml.job.config.JobTaskState;
import org.elasticsearch.xpack.core.monitoring.MonitoringFeatureSetUsage;
import org.elasticsearch.xpack.core.rollup.RollupFeatureSetUsage;
@ -381,7 +381,7 @@ public class XPackClientPlugin extends Plugin implements ActionPlugin, NetworkPl
StartDataFrameAnalyticsAction.INSTANCE,
StopDataFrameAnalyticsAction.INSTANCE,
EvaluateDataFrameAction.INSTANCE,
EstimateMemoryUsageAction.INSTANCE,
ExplainDataFrameAnalyticsAction.INSTANCE,
InternalInferModelAction.INSTANCE,
GetTrainedModelsAction.INSTANCE,
DeleteTrainedModelAction.INSTANCE,

View File

@ -1,119 +0,0 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.action;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.ActionType;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
import java.util.Objects;
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg;
public class EstimateMemoryUsageAction extends ActionType<EstimateMemoryUsageAction.Response> {
public static final EstimateMemoryUsageAction INSTANCE = new EstimateMemoryUsageAction();
public static final String NAME = "cluster:admin/xpack/ml/data_frame/analytics/estimate_memory_usage";
private EstimateMemoryUsageAction() {
super(NAME, EstimateMemoryUsageAction.Response::new);
}
public static class Response extends ActionResponse implements ToXContentObject {
public static final ParseField TYPE = new ParseField("memory_usage_estimation_result");
public static final ParseField EXPECTED_MEMORY_WITHOUT_DISK = new ParseField("expected_memory_without_disk");
public static final ParseField EXPECTED_MEMORY_WITH_DISK = new ParseField("expected_memory_with_disk");
static final ConstructingObjectParser<Response, Void> PARSER =
new ConstructingObjectParser<>(
TYPE.getPreferredName(),
args -> new Response((ByteSizeValue) args[0], (ByteSizeValue) args[1]));
static {
PARSER.declareField(
optionalConstructorArg(),
(p, c) -> ByteSizeValue.parseBytesSizeValue(p.text(), EXPECTED_MEMORY_WITHOUT_DISK.getPreferredName()),
EXPECTED_MEMORY_WITHOUT_DISK,
ObjectParser.ValueType.VALUE);
PARSER.declareField(
optionalConstructorArg(),
(p, c) -> ByteSizeValue.parseBytesSizeValue(p.text(), EXPECTED_MEMORY_WITH_DISK.getPreferredName()),
EXPECTED_MEMORY_WITH_DISK,
ObjectParser.ValueType.VALUE);
}
private final ByteSizeValue expectedMemoryWithoutDisk;
private final ByteSizeValue expectedMemoryWithDisk;
public Response(@Nullable ByteSizeValue expectedMemoryWithoutDisk, @Nullable ByteSizeValue expectedMemoryWithDisk) {
this.expectedMemoryWithoutDisk = expectedMemoryWithoutDisk;
this.expectedMemoryWithDisk = expectedMemoryWithDisk;
}
public Response(StreamInput in) throws IOException {
super(in);
this.expectedMemoryWithoutDisk = in.readOptionalWriteable(ByteSizeValue::new);
this.expectedMemoryWithDisk = in.readOptionalWriteable(ByteSizeValue::new);
}
public ByteSizeValue getExpectedMemoryWithoutDisk() {
return expectedMemoryWithoutDisk;
}
public ByteSizeValue getExpectedMemoryWithDisk() {
return expectedMemoryWithDisk;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalWriteable(expectedMemoryWithoutDisk);
out.writeOptionalWriteable(expectedMemoryWithDisk);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
if (expectedMemoryWithoutDisk != null) {
builder.field(EXPECTED_MEMORY_WITHOUT_DISK.getPreferredName(), expectedMemoryWithoutDisk.getStringRep());
}
if (expectedMemoryWithDisk != null) {
builder.field(EXPECTED_MEMORY_WITH_DISK.getPreferredName(), expectedMemoryWithDisk.getStringRep());
}
builder.endObject();
return builder;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other == null || getClass() != other.getClass()) {
return false;
}
Response that = (Response) other;
return Objects.equals(expectedMemoryWithoutDisk, that.expectedMemoryWithoutDisk)
&& Objects.equals(expectedMemoryWithDisk, that.expectedMemoryWithDisk);
}
@Override
public int hashCode() {
return Objects.hash(expectedMemoryWithoutDisk, expectedMemoryWithDisk);
}
}
}

View File

@ -0,0 +1,101 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.action;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.ActionType;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.xpack.core.ml.dataframe.explain.MemoryEstimation;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
public class ExplainDataFrameAnalyticsAction extends ActionType<ExplainDataFrameAnalyticsAction.Response> {
public static final ExplainDataFrameAnalyticsAction INSTANCE = new ExplainDataFrameAnalyticsAction();
public static final String NAME = "cluster:admin/xpack/ml/data_frame/analytics/explain";
private ExplainDataFrameAnalyticsAction() {
super(NAME, ExplainDataFrameAnalyticsAction.Response::new);
}
public static class Response extends ActionResponse implements ToXContentObject {
public static final ParseField TYPE = new ParseField("explain_data_frame_analytics_response");
public static final ParseField FIELD_SELECTION = new ParseField("field_selection");
public static final ParseField MEMORY_ESTIMATION = new ParseField("memory_estimation");
static final ConstructingObjectParser<Response, Void> PARSER =
new ConstructingObjectParser<>(
TYPE.getPreferredName(),
args -> new Response((List<FieldSelection>) args[0], (MemoryEstimation) args[1]));
static {
PARSER.declareObjectArray(ConstructingObjectParser.constructorArg(), FieldSelection.PARSER, FIELD_SELECTION);
PARSER.declareObject(ConstructingObjectParser.constructorArg(), MemoryEstimation.PARSER, MEMORY_ESTIMATION);
}
private final List<FieldSelection> fieldSelection;
private final MemoryEstimation memoryEstimation;
public Response(List<FieldSelection> fieldSelection, MemoryEstimation memoryEstimation) {
this.fieldSelection = Objects.requireNonNull(fieldSelection);
this.memoryEstimation = Objects.requireNonNull(memoryEstimation);
}
public Response(StreamInput in) throws IOException {
super(in);
this.fieldSelection = in.readList(FieldSelection::new);
this.memoryEstimation = new MemoryEstimation(in);
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeList(fieldSelection);
memoryEstimation.writeTo(out);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field(FIELD_SELECTION.getPreferredName(), fieldSelection);
builder.field(MEMORY_ESTIMATION.getPreferredName(), memoryEstimation);
builder.endObject();
return builder;
}
@Override
public boolean equals(Object other) {
if (this == other) return true;
if (other == null || getClass() != other.getClass()) return false;
Response that = (Response) other;
return Objects.equals(fieldSelection, that.fieldSelection)
&& Objects.equals(memoryEstimation, that.memoryEstimation);
}
@Override
public int hashCode() {
return Objects.hash(fieldSelection, memoryEstimation);
}
public MemoryEstimation getMemoryEstimation() {
return memoryEstimation;
}
public List<FieldSelection> getFieldSelection() {
return fieldSelection;
}
}
}
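
As a quick illustration of the response shape (not in the commit; the field name and sizes are invented), a Response can be built by hand and rendered through its toXContent:

----
import java.util.Collections;
import java.util.List;

import org.elasticsearch.common.Strings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.xpack.core.ml.dataframe.explain.MemoryEstimation;

List<FieldSelection> fieldSelection = Collections.singletonList(
    FieldSelection.included("price", Collections.singleton("float"), true,
        FieldSelection.FeatureType.NUMERICAL));
MemoryEstimation memoryEstimation = new MemoryEstimation(
    new ByteSizeValue(128, ByteSizeUnit.MB), new ByteSizeValue(32, ByteSizeUnit.MB));
ExplainDataFrameAnalyticsAction.Response response =
    new ExplainDataFrameAnalyticsAction.Response(fieldSelection, memoryEstimation);
// Yields {"field_selection":[{...}],"memory_estimation":{...}} as in the docs example above.
String json = Strings.toString(response);
----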

View File

@ -51,13 +51,14 @@ public class PutDataFrameAnalyticsAction extends ActionType<PutDataFrameAnalytic
}
/**
* Parses request for memory estimation.
* {@link Request} is reused across {@link PutDataFrameAnalyticsAction} and {@link EstimateMemoryUsageAction} but parsing differs
* Parses request for use in the explain action.
* {@link Request} is reused across {@link PutDataFrameAnalyticsAction} and
* {@link ExplainDataFrameAnalyticsAction} but parsing differs
* between these two usages.
*/
public static Request parseRequestForMemoryEstimation(XContentParser parser) {
public static Request parseRequestForExplain(XContentParser parser) {
DataFrameAnalyticsConfig.Builder configBuilder = DataFrameAnalyticsConfig.STRICT_PARSER.apply(parser, null);
DataFrameAnalyticsConfig config = configBuilder.buildForMemoryEstimation();
DataFrameAnalyticsConfig config = configBuilder.buildForExplain();
return new PutDataFrameAnalyticsAction.Request(config);
}

View File

@ -416,11 +416,11 @@ public class DataFrameAnalyticsConfig implements ToXContentObject, Writeable {
}
/**
* Builds {@link DataFrameAnalyticsConfig} object for the purpose of performing memory estimation.
* Builds {@link DataFrameAnalyticsConfig} object for the purpose of explaining a job that has not been created yet.
* Some fields (i.e. "id", "dest") may not be present, therefore we overwrite them here to make {@link DataFrameAnalyticsConfig}'s
* constructor validations happy.
*/
public DataFrameAnalyticsConfig buildForMemoryEstimation() {
public DataFrameAnalyticsConfig buildForExplain() {
return new DataFrameAnalyticsConfig(
id != null ? id : "dummy",
description,

View File

@ -0,0 +1,184 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.dataframe.explain;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
public class FieldSelection implements ToXContentObject, Writeable {
private static final ParseField NAME = new ParseField("name");
private static final ParseField MAPPING_TYPES = new ParseField("mapping_types");
private static final ParseField IS_INCLUDED = new ParseField("is_included");
private static final ParseField IS_REQUIRED = new ParseField("is_required");
private static final ParseField FEATURE_TYPE = new ParseField("feature_type");
private static final ParseField REASON = new ParseField("reason");
public enum FeatureType {
CATEGORICAL, NUMERICAL;
public static FeatureType fromString(String value) {
return FeatureType.valueOf(value.toUpperCase(Locale.ROOT));
}
@Override
public String toString() {
return name().toLowerCase(Locale.ROOT);
}
}
public static ConstructingObjectParser<FieldSelection, Void> PARSER = new ConstructingObjectParser<>("field_selection",
a -> new FieldSelection((String) a[0], new HashSet<>((List<String>) a[1]), (boolean) a[2], (boolean) a[3], (FeatureType) a[4],
(String) a[5]));
static {
PARSER.declareString(ConstructingObjectParser.constructorArg(), NAME);
PARSER.declareStringArray(ConstructingObjectParser.constructorArg(), MAPPING_TYPES);
PARSER.declareBoolean(ConstructingObjectParser.constructorArg(), IS_INCLUDED);
PARSER.declareBoolean(ConstructingObjectParser.constructorArg(), IS_REQUIRED);
PARSER.declareField(ConstructingObjectParser.optionalConstructorArg(), p -> {
if (p.currentToken() == XContentParser.Token.VALUE_STRING) {
return FeatureType.fromString(p.text());
}
throw new IllegalArgumentException("Unsupported token [" + p.currentToken() + "]");
}, FEATURE_TYPE, ObjectParser.ValueType.STRING);
PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), REASON);
}
private final String name;
private final Set<String> mappingTypes;
private final boolean isIncluded;
private final boolean isRequired;
private final FeatureType featureType;
private final String reason;
public static FieldSelection included(String name, Set<String> mappingTypes, boolean isRequired, FeatureType featureType) {
return new FieldSelection(name, mappingTypes, true, isRequired, featureType, null);
}
public static FieldSelection excluded(String name, Set<String> mappingTypes, String reason) {
return new FieldSelection(name, mappingTypes, false, false, null, reason);
}
FieldSelection(String name, Set<String> mappingTypes, boolean isIncluded, boolean isRequired, @Nullable FeatureType featureType,
@Nullable String reason) {
this.name = Objects.requireNonNull(name);
this.mappingTypes = Collections.unmodifiableSet(mappingTypes);
this.isIncluded = isIncluded;
this.isRequired = isRequired;
this.featureType = featureType;
this.reason = reason;
}
public FieldSelection(StreamInput in) throws IOException {
this.name = in.readString();
this.mappingTypes = Collections.unmodifiableSet(in.readSet(StreamInput::readString));
this.isIncluded = in.readBoolean();
this.isRequired = in.readBoolean();
boolean hasFeatureType = in.readBoolean();
if (hasFeatureType) {
this.featureType = in.readEnum(FeatureType.class);
} else {
this.featureType = null;
}
this.reason = in.readOptionalString();
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(name);
out.writeCollection(mappingTypes, StreamOutput::writeString);
out.writeBoolean(isIncluded);
out.writeBoolean(isRequired);
if (featureType == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeEnum(featureType);
}
out.writeOptionalString(reason);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field(NAME.getPreferredName(), name);
builder.field(MAPPING_TYPES.getPreferredName(), mappingTypes);
builder.field(IS_INCLUDED.getPreferredName(), isIncluded);
builder.field(IS_REQUIRED.getPreferredName(), isRequired);
if (featureType != null) {
builder.field(FEATURE_TYPE.getPreferredName(), featureType);
}
if (reason != null) {
builder.field(REASON.getPreferredName(), reason);
}
builder.endObject();
return builder;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
FieldSelection that = (FieldSelection) o;
return Objects.equals(name, that.name)
&& Objects.equals(mappingTypes, that.mappingTypes)
&& isIncluded == that.isIncluded
&& isRequired == that.isRequired
&& Objects.equals(featureType, that.featureType)
&& Objects.equals(reason, that.reason);
}
@Override
public int hashCode() {
return Objects.hash(name, mappingTypes, isIncluded, isRequired, featureType, reason);
}
public String getName() {
return name;
}
public Set<String> getMappingTypes() {
return mappingTypes;
}
public boolean isIncluded() {
return isIncluded;
}
public boolean isRequired() {
return isRequired;
}
@Nullable
public FeatureType getFeatureType() {
return featureType;
}
@Nullable
public String getReason() {
return reason;
}
}
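
A short usage sketch (not from the commit): the two factory methods capture the two states shown in the docs example: included fields carry a feature type, while excluded fields carry the reason they were dropped. The names and reason text below are illustrative:

----
// Included field: has a feature type, no reason.
FieldSelection price = FieldSelection.included(
    "price", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL);
// Excluded field: no feature type; the reason explains the exclusion.
FieldSelection dropped = FieldSelection.excluded(
    "some_field", Collections.singleton("text"), "illustrative reason text");
----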

View File

@ -0,0 +1,103 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.dataframe.explain;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
import java.util.Objects;
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg;
public class MemoryEstimation implements ToXContentObject, Writeable {
public static final ParseField EXPECTED_MEMORY_WITHOUT_DISK = new ParseField("expected_memory_without_disk");
public static final ParseField EXPECTED_MEMORY_WITH_DISK = new ParseField("expected_memory_with_disk");
public static final ConstructingObjectParser<MemoryEstimation, Void> PARSER = new ConstructingObjectParser<>("memory_estimation",
a -> new MemoryEstimation((ByteSizeValue) a[0], (ByteSizeValue) a[1]));
static {
PARSER.declareField(
optionalConstructorArg(),
(p, c) -> ByteSizeValue.parseBytesSizeValue(p.text(), EXPECTED_MEMORY_WITHOUT_DISK.getPreferredName()),
EXPECTED_MEMORY_WITHOUT_DISK,
ObjectParser.ValueType.VALUE);
PARSER.declareField(
optionalConstructorArg(),
(p, c) -> ByteSizeValue.parseBytesSizeValue(p.text(), EXPECTED_MEMORY_WITH_DISK.getPreferredName()),
EXPECTED_MEMORY_WITH_DISK,
ObjectParser.ValueType.VALUE);
}
private final ByteSizeValue expectedMemoryWithoutDisk;
private final ByteSizeValue expectedMemoryWithDisk;
public MemoryEstimation(@Nullable ByteSizeValue expectedMemoryWithoutDisk, @Nullable ByteSizeValue expectedMemoryWithDisk) {
this.expectedMemoryWithoutDisk = expectedMemoryWithoutDisk;
this.expectedMemoryWithDisk = expectedMemoryWithDisk;
}
public MemoryEstimation(StreamInput in) throws IOException {
this.expectedMemoryWithoutDisk = in.readOptionalWriteable(ByteSizeValue::new);
this.expectedMemoryWithDisk = in.readOptionalWriteable(ByteSizeValue::new);
}
public ByteSizeValue getExpectedMemoryWithoutDisk() {
return expectedMemoryWithoutDisk;
}
public ByteSizeValue getExpectedMemoryWithDisk() {
return expectedMemoryWithDisk;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalWriteable(expectedMemoryWithoutDisk);
out.writeOptionalWriteable(expectedMemoryWithDisk);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
if (expectedMemoryWithoutDisk != null) {
builder.field(EXPECTED_MEMORY_WITHOUT_DISK.getPreferredName(), expectedMemoryWithoutDisk.getStringRep());
}
if (expectedMemoryWithDisk != null) {
builder.field(EXPECTED_MEMORY_WITH_DISK.getPreferredName(), expectedMemoryWithDisk.getStringRep());
}
builder.endObject();
return builder;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other == null || getClass() != other.getClass()) {
return false;
}
MemoryEstimation that = (MemoryEstimation) other;
return Objects.equals(expectedMemoryWithoutDisk, that.expectedMemoryWithoutDisk)
&& Objects.equals(expectedMemoryWithDisk, that.expectedMemoryWithDisk);
}
@Override
public int hashCode() {
return Objects.hash(expectedMemoryWithoutDisk, expectedMemoryWithDisk);
}
}
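
Both estimates are optional; a sketch (not from the commit) of how an absent value simply drops out of the serialized form:

----
// Both constructor arguments are @Nullable; null values are omitted from the JSON.
MemoryEstimation estimation = new MemoryEstimation(new ByteSizeValue(128, ByteSizeUnit.MB), null);
String json = Strings.toString(estimation);   // {"expected_memory_without_disk":"128mb"}
----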

View File

@ -1,54 +0,0 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.action;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractSerializingTestCase;
import org.elasticsearch.xpack.core.ml.action.EstimateMemoryUsageAction.Response;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.nullValue;
public class EstimateMemoryUsageActionResponseTests extends AbstractSerializingTestCase<Response> {
@Override
protected Response createTestInstance() {
return new Response(
randomBoolean() ? new ByteSizeValue(randomNonNegativeLong()) : null,
randomBoolean() ? new ByteSizeValue(randomNonNegativeLong()) : null);
}
@Override
protected Writeable.Reader<Response> instanceReader() {
return Response::new;
}
@Override
protected Response doParseInstance(XContentParser parser) {
return Response.PARSER.apply(parser, null);
}
public void testConstructor_NullValues() {
Response response = new Response(null, null);
assertThat(response.getExpectedMemoryWithoutDisk(), nullValue());
assertThat(response.getExpectedMemoryWithDisk(), nullValue());
}
public void testConstructor_SmallValues() {
Response response = new Response(new ByteSizeValue(120, ByteSizeUnit.KB), new ByteSizeValue(30, ByteSizeUnit.KB));
assertThat(response.getExpectedMemoryWithoutDisk(), equalTo(new ByteSizeValue(120, ByteSizeUnit.KB)));
assertThat(response.getExpectedMemoryWithDisk(), equalTo(new ByteSizeValue(30, ByteSizeUnit.KB)));
}
public void testConstructor() {
Response response = new Response(new ByteSizeValue(20, ByteSizeUnit.MB), new ByteSizeValue(10, ByteSizeUnit.MB));
assertThat(response.getExpectedMemoryWithoutDisk(), equalTo(new ByteSizeValue(20, ByteSizeUnit.MB)));
assertThat(response.getExpectedMemoryWithDisk(), equalTo(new ByteSizeValue(10, ByteSizeUnit.MB)));
}
}

View File

@ -0,0 +1,42 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.action;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractSerializingTestCase;
import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction.Response;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelectionTests;
import org.elasticsearch.xpack.core.ml.dataframe.explain.MemoryEstimation;
import org.elasticsearch.xpack.core.ml.dataframe.explain.MemoryEstimationTests;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.IntStream;
public class ExplainDataFrameAnalyticsActionResponseTests extends AbstractSerializingTestCase<Response> {
@Override
protected Response createTestInstance() {
int fieldSelectionCount = randomIntBetween(1, 5);
List<FieldSelection> fieldSelection = new ArrayList<>(fieldSelectionCount);
IntStream.range(0, fieldSelectionCount).forEach(i -> fieldSelection.add(FieldSelectionTests.createRandom()));
MemoryEstimation memoryEstimation = MemoryEstimationTests.createRandom();
return new Response(fieldSelection, memoryEstimation);
}
@Override
protected Writeable.Reader<Response> instanceReader() {
return Response::new;
}
@Override
protected Response doParseInstance(XContentParser parser) {
return Response.PARSER.apply(parser, null);
}
}

View File

@ -279,32 +279,32 @@ public class DataFrameAnalyticsConfigTests extends AbstractSerializingTestCase<D
assertThat(e.getMessage(), containsString("must be less than the value of the xpack.ml.max_model_memory_limit setting"));
}
public void testBuildForMemoryEstimation() {
public void testBuildForExplain() {
DataFrameAnalyticsConfig.Builder builder = createRandomBuilder("foo");
DataFrameAnalyticsConfig config = builder.buildForMemoryEstimation();
DataFrameAnalyticsConfig config = builder.buildForExplain();
assertThat(config, equalTo(builder.build()));
}
public void testBuildForMemoryEstimation_MissingId() {
public void testBuildForExplain_MissingId() {
DataFrameAnalyticsConfig.Builder builder = new DataFrameAnalyticsConfig.Builder()
.setAnalysis(OutlierDetectionTests.createRandom())
.setSource(DataFrameAnalyticsSourceTests.createRandom())
.setDest(DataFrameAnalyticsDestTests.createRandom());
DataFrameAnalyticsConfig config = builder.buildForMemoryEstimation();
DataFrameAnalyticsConfig config = builder.buildForExplain();
assertThat(config.getId(), equalTo("dummy"));
}
public void testBuildForMemoryEstimation_MissingDest() {
public void testBuildForExplain_MissingDest() {
DataFrameAnalyticsConfig.Builder builder = new DataFrameAnalyticsConfig.Builder()
.setId("foo")
.setAnalysis(OutlierDetectionTests.createRandom())
.setSource(DataFrameAnalyticsSourceTests.createRandom());
DataFrameAnalyticsConfig config = builder.buildForMemoryEstimation();
DataFrameAnalyticsConfig config = builder.buildForExplain();
assertThat(config.getDest().getIndex(), equalTo("dummy"));
}

View File

@ -0,0 +1,45 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.dataframe.explain;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractSerializingTestCase;
import java.io.IOException;
import java.util.Set;
import java.util.stream.Collectors;
public class FieldSelectionTests extends AbstractSerializingTestCase<FieldSelection> {
public static FieldSelection createRandom() {
Set<String> mappingTypes = randomSubsetOf(randomIntBetween(1, 3), "int", "float", "double", "text", "keyword", "ip")
.stream().collect(Collectors.toSet());
FieldSelection.FeatureType featureType = randomBoolean() ? null : randomFrom(FieldSelection.FeatureType.values());
String reason = randomBoolean() ? null : randomAlphaOfLength(20);
return new FieldSelection(randomAlphaOfLength(10),
mappingTypes,
randomBoolean(),
randomBoolean(),
featureType,
reason);
}
@Override
protected FieldSelection createTestInstance() {
return createRandom();
}
@Override
protected FieldSelection doParseInstance(XContentParser parser) throws IOException {
return FieldSelection.PARSER.apply(parser, null);
}
@Override
protected Writeable.Reader<FieldSelection> instanceReader() {
return FieldSelection::new;
}
}

View File

@ -0,0 +1,61 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.core.ml.dataframe.explain;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.test.AbstractSerializingTestCase;
import java.io.IOException;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.nullValue;
public class MemoryEstimationTests extends AbstractSerializingTestCase<MemoryEstimation> {
public static MemoryEstimation createRandom() {
return new MemoryEstimation(
randomBoolean() ? new ByteSizeValue(randomNonNegativeLong()) : null,
randomBoolean() ? new ByteSizeValue(randomNonNegativeLong()) : null);
}
@Override
protected MemoryEstimation createTestInstance() {
return createRandom();
}
@Override
protected Writeable.Reader<MemoryEstimation> instanceReader() {
return MemoryEstimation::new;
}
@Override
protected MemoryEstimation doParseInstance(XContentParser parser) throws IOException {
return MemoryEstimation.PARSER.apply(parser, null);
}
public void testConstructor_NullValues() {
MemoryEstimation memoryEstimation = new MemoryEstimation(null, null);
assertThat(memoryEstimation.getExpectedMemoryWithoutDisk(), nullValue());
assertThat(memoryEstimation.getExpectedMemoryWithDisk(), nullValue());
}
public void testConstructor_SmallValues() {
MemoryEstimation memoryEstimation = new MemoryEstimation(
new ByteSizeValue(120, ByteSizeUnit.KB), new ByteSizeValue(30, ByteSizeUnit.KB));
assertThat(memoryEstimation.getExpectedMemoryWithoutDisk(), equalTo(new ByteSizeValue(120, ByteSizeUnit.KB)));
assertThat(memoryEstimation.getExpectedMemoryWithDisk(), equalTo(new ByteSizeValue(30, ByteSizeUnit.KB)));
}
public void testConstructor() {
MemoryEstimation memoryEstimation = new MemoryEstimation(
new ByteSizeValue(20, ByteSizeUnit.MB), new ByteSizeValue(10, ByteSizeUnit.MB));
assertThat(memoryEstimation.getExpectedMemoryWithoutDisk(), equalTo(new ByteSizeValue(20, ByteSizeUnit.MB)));
assertThat(memoryEstimation.getExpectedMemoryWithDisk(), equalTo(new ByteSizeValue(10, ByteSizeUnit.MB)));
}
}

View File

@ -92,7 +92,6 @@ integTest.runner {
'ml/data_frame_analytics_crud/Test put classification given num_top_classes is greater than 1k',
'ml/data_frame_analytics_crud/Test put classification given training_percent is less than one',
'ml/data_frame_analytics_crud/Test put classification given training_percent is greater than hundred',
'ml/data_frame_analytics_memory_usage_estimation/Test memory usage estimation for empty data frame',
'ml/evaluate_data_frame/Test given missing index',
'ml/evaluate_data_frame/Test given index does not exist',
'ml/evaluate_data_frame/Test given missing evaluation',
@ -113,6 +112,10 @@ integTest.runner {
'ml/evaluate_data_frame/Test regression given evaluation with empty metrics',
'ml/evaluate_data_frame/Test regression given missing actual_field',
'ml/evaluate_data_frame/Test regression given missing predicted_field',
'ml/explain_data_frame_analytics/Test neither job id nor body',
'ml/explain_data_frame_analytics/Test both job id and body',
'ml/explain_data_frame_analytics/Test missing job',
'ml/explain_data_frame_analytics/Test empty data frame given body',
'ml/delete_job_force/Test cannot force delete a non-existent job',
'ml/delete_model_snapshot/Test delete snapshot missing snapshotId',
'ml/delete_model_snapshot/Test delete snapshot missing job_id',

View File

@ -65,6 +65,7 @@ import org.elasticsearch.xpack.core.XPackSettings;
import org.elasticsearch.xpack.core.ml.MachineLearningField;
import org.elasticsearch.xpack.core.ml.MlMetaIndex;
import org.elasticsearch.xpack.core.ml.action.CloseJobAction;
import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.DeleteCalendarAction;
import org.elasticsearch.xpack.core.ml.action.DeleteCalendarEventAction;
import org.elasticsearch.xpack.core.ml.action.DeleteDataFrameAnalyticsAction;
@ -75,7 +76,6 @@ import org.elasticsearch.xpack.core.ml.action.DeleteForecastAction;
import org.elasticsearch.xpack.core.ml.action.DeleteJobAction;
import org.elasticsearch.xpack.core.ml.action.DeleteModelSnapshotAction;
import org.elasticsearch.xpack.core.ml.action.DeleteTrainedModelAction;
import org.elasticsearch.xpack.core.ml.action.EstimateMemoryUsageAction;
import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction;
import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction;
import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
@ -98,8 +98,8 @@ import org.elasticsearch.xpack.core.ml.action.GetOverallBucketsAction;
import org.elasticsearch.xpack.core.ml.action.GetRecordsAction;
import org.elasticsearch.xpack.core.ml.action.GetTrainedModelsAction;
import org.elasticsearch.xpack.core.ml.action.GetTrainedModelsStatsAction;
import org.elasticsearch.xpack.core.ml.action.IsolateDatafeedAction;
import org.elasticsearch.xpack.core.ml.action.InternalInferModelAction;
import org.elasticsearch.xpack.core.ml.action.IsolateDatafeedAction;
import org.elasticsearch.xpack.core.ml.action.KillProcessAction;
import org.elasticsearch.xpack.core.ml.action.MlInfoAction;
import org.elasticsearch.xpack.core.ml.action.OpenJobAction;
@ -136,6 +136,7 @@ import org.elasticsearch.xpack.core.ml.job.persistence.ElasticsearchMappings;
import org.elasticsearch.xpack.core.ml.notifications.AuditorField;
import org.elasticsearch.xpack.core.template.TemplateUtils;
import org.elasticsearch.xpack.ml.action.TransportCloseJobAction;
import org.elasticsearch.xpack.ml.action.TransportExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.ml.action.TransportDeleteCalendarAction;
import org.elasticsearch.xpack.ml.action.TransportDeleteCalendarEventAction;
import org.elasticsearch.xpack.ml.action.TransportDeleteDataFrameAnalyticsAction;
@ -146,7 +147,6 @@ import org.elasticsearch.xpack.ml.action.TransportDeleteForecastAction;
import org.elasticsearch.xpack.ml.action.TransportDeleteJobAction;
import org.elasticsearch.xpack.ml.action.TransportDeleteModelSnapshotAction;
import org.elasticsearch.xpack.ml.action.TransportDeleteTrainedModelAction;
import org.elasticsearch.xpack.ml.action.TransportEstimateMemoryUsageAction;
import org.elasticsearch.xpack.ml.action.TransportEvaluateDataFrameAction;
import org.elasticsearch.xpack.ml.action.TransportFinalizeJobExecutionAction;
import org.elasticsearch.xpack.ml.action.TransportFindFileStructureAction;
@ -167,9 +167,9 @@ import org.elasticsearch.xpack.ml.action.TransportGetJobsStatsAction;
import org.elasticsearch.xpack.ml.action.TransportGetModelSnapshotsAction;
import org.elasticsearch.xpack.ml.action.TransportGetOverallBucketsAction;
import org.elasticsearch.xpack.ml.action.TransportGetRecordsAction;
import org.elasticsearch.xpack.ml.action.TransportGetTrainedModelsAction;
import org.elasticsearch.xpack.ml.action.TransportGetTrainedModelsStatsAction;
import org.elasticsearch.xpack.ml.action.TransportInternalInferModelAction;
import org.elasticsearch.xpack.ml.action.TransportGetTrainedModelsAction;
import org.elasticsearch.xpack.ml.action.TransportIsolateDatafeedAction;
import org.elasticsearch.xpack.ml.action.TransportKillProcessAction;
import org.elasticsearch.xpack.ml.action.TransportMlInfoAction;
@ -258,8 +258,8 @@ import org.elasticsearch.xpack.ml.rest.datafeeds.RestPutDatafeedAction;
import org.elasticsearch.xpack.ml.rest.datafeeds.RestStartDatafeedAction;
import org.elasticsearch.xpack.ml.rest.datafeeds.RestStopDatafeedAction;
import org.elasticsearch.xpack.ml.rest.datafeeds.RestUpdateDatafeedAction;
import org.elasticsearch.xpack.ml.rest.dataframe.RestExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.ml.rest.dataframe.RestDeleteDataFrameAnalyticsAction;
import org.elasticsearch.xpack.ml.rest.dataframe.RestEstimateMemoryUsageAction;
import org.elasticsearch.xpack.ml.rest.dataframe.RestEvaluateDataFrameAction;
import org.elasticsearch.xpack.ml.rest.dataframe.RestGetDataFrameAnalyticsAction;
import org.elasticsearch.xpack.ml.rest.dataframe.RestGetDataFrameAnalyticsStatsAction;
@ -759,7 +759,7 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
new RestStartDataFrameAnalyticsAction(restController),
new RestStopDataFrameAnalyticsAction(restController),
new RestEvaluateDataFrameAction(restController),
new RestEstimateMemoryUsageAction(restController),
new RestExplainDataFrameAnalyticsAction(restController),
new RestGetTrainedModelsAction(restController),
new RestDeleteTrainedModelAction(restController),
new RestGetTrainedModelsStatsAction(restController)
@ -829,7 +829,7 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
new ActionHandler<>(StartDataFrameAnalyticsAction.INSTANCE, TransportStartDataFrameAnalyticsAction.class),
new ActionHandler<>(StopDataFrameAnalyticsAction.INSTANCE, TransportStopDataFrameAnalyticsAction.class),
new ActionHandler<>(EvaluateDataFrameAction.INSTANCE, TransportEvaluateDataFrameAction.class),
new ActionHandler<>(EstimateMemoryUsageAction.INSTANCE, TransportEstimateMemoryUsageAction.class),
new ActionHandler<>(ExplainDataFrameAnalyticsAction.INSTANCE, TransportExplainDataFrameAnalyticsAction.class),
new ActionHandler<>(InternalInferModelAction.INSTANCE, TransportInternalInferModelAction.class),
new ActionHandler<>(GetTrainedModelsAction.INSTANCE, TransportGetTrainedModelsAction.class),
new ActionHandler<>(DeleteTrainedModelAction.INSTANCE, TransportDeleteTrainedModelAction.class),

View File

@ -1,130 +0,0 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.action;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.HandledTransportAction;
import org.elasticsearch.client.node.NodeClient;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.transport.TransportService;
import org.elasticsearch.xpack.core.ml.action.EstimateMemoryUsageAction;
import org.elasticsearch.xpack.core.ml.action.PutDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractorFactory;
import org.elasticsearch.xpack.ml.dataframe.process.MemoryUsageEstimationProcessManager;
import java.util.Objects;
import java.util.Optional;
/**
* Estimates memory usage for the given data frame analytics spec.
* Redirects to a different node if the current node is *not* an ML node.
*/
public class TransportEstimateMemoryUsageAction
extends HandledTransportAction<PutDataFrameAnalyticsAction.Request, EstimateMemoryUsageAction.Response> {
private final TransportService transportService;
private final ClusterService clusterService;
private final NodeClient client;
private final MemoryUsageEstimationProcessManager processManager;
@Inject
public TransportEstimateMemoryUsageAction(TransportService transportService,
ActionFilters actionFilters,
ClusterService clusterService,
NodeClient client,
MemoryUsageEstimationProcessManager processManager) {
super(EstimateMemoryUsageAction.NAME, transportService, actionFilters, PutDataFrameAnalyticsAction.Request::new);
this.transportService = transportService;
this.clusterService = Objects.requireNonNull(clusterService);
this.client = Objects.requireNonNull(client);
this.processManager = Objects.requireNonNull(processManager);
}
@Override
protected void doExecute(Task task,
PutDataFrameAnalyticsAction.Request request,
ActionListener<EstimateMemoryUsageAction.Response> listener) {
DiscoveryNode localNode = clusterService.localNode();
if (MachineLearning.isMlNode(localNode)) {
doEstimateMemoryUsage(createTaskIdForMemoryEstimation(task), request, listener);
} else {
redirectToMlNode(request, listener);
}
}
/**
* Creates unique task id for the memory estimation process. This id is useful when logging.
*/
private static String createTaskIdForMemoryEstimation(Task task) {
return "memory_usage_estimation_" + task.getId();
}
/**
* Performs memory usage estimation.
* Memory usage estimation spawns an ML C++ process which is only available on ML nodes. That's why this method can only be called on
* the ML node.
*/
private void doEstimateMemoryUsage(String taskId,
PutDataFrameAnalyticsAction.Request request,
ActionListener<EstimateMemoryUsageAction.Response> listener) {
DataFrameDataExtractorFactory.createForSourceIndices(
client,
taskId,
true, // We are not interested in first-time run validations here
request.getConfig(),
ActionListener.wrap(
dataExtractorFactory -> {
processManager.runJobAsync(
taskId,
request.getConfig(),
dataExtractorFactory,
ActionListener.wrap(
result -> listener.onResponse(
new EstimateMemoryUsageAction.Response(
result.getExpectedMemoryWithoutDisk(), result.getExpectedMemoryWithDisk())),
listener::onFailure
)
);
},
listener::onFailure
)
);
}
/**
* Finds the first available ML node in the cluster and redirects the request to this node.
*/
private void redirectToMlNode(PutDataFrameAnalyticsAction.Request request,
ActionListener<EstimateMemoryUsageAction.Response> listener) {
Optional<DiscoveryNode> node = findMlNode(clusterService.state());
if (node.isPresent()) {
transportService.sendRequest(
node.get(), actionName, request, new ActionListenerResponseHandler<>(listener, EstimateMemoryUsageAction.Response::new));
} else {
listener.onFailure(ExceptionsHelper.badRequestException("No ML node to run on"));
}
}
/**
* Finds the first available ML node in the cluster state.
*/
private static Optional<DiscoveryNode> findMlNode(ClusterState clusterState) {
for (DiscoveryNode node : clusterState.getNodes()) {
if (MachineLearning.isMlNode(node)) {
return Optional.of(node);
}
}
return Optional.empty();
}
}

View File

@ -0,0 +1,156 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.action;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.HandledTransportAction;
import org.elasticsearch.client.node.NodeClient;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.license.LicenseUtils;
import org.elasticsearch.license.XPackLicenseState;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.transport.TransportService;
import org.elasticsearch.xpack.core.XPackField;
import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.PutDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.xpack.core.ml.dataframe.explain.MemoryEstimation;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractorFactory;
import org.elasticsearch.xpack.ml.dataframe.extractor.ExtractedFieldsDetector;
import org.elasticsearch.xpack.ml.dataframe.extractor.ExtractedFieldsDetectorFactory;
import org.elasticsearch.xpack.ml.dataframe.process.MemoryUsageEstimationProcessManager;
import org.elasticsearch.xpack.ml.extractor.ExtractedFields;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
/**
* Provides explanations for aspects of the given data frame analytics spec, such as memory estimation and field selection.
* Redirects to a different node if the current node is *not* an ML node.
*/
public class TransportExplainDataFrameAnalyticsAction
extends HandledTransportAction<PutDataFrameAnalyticsAction.Request, ExplainDataFrameAnalyticsAction.Response> {
private final XPackLicenseState licenseState;
private final TransportService transportService;
private final ClusterService clusterService;
private final NodeClient client;
private final MemoryUsageEstimationProcessManager processManager;
@Inject
public TransportExplainDataFrameAnalyticsAction(TransportService transportService,
ActionFilters actionFilters,
ClusterService clusterService,
NodeClient client,
XPackLicenseState licenseState,
MemoryUsageEstimationProcessManager processManager) {
super(ExplainDataFrameAnalyticsAction.NAME, transportService, actionFilters, PutDataFrameAnalyticsAction.Request::new);
this.transportService = transportService;
this.clusterService = Objects.requireNonNull(clusterService);
this.client = Objects.requireNonNull(client);
this.licenseState = licenseState;
this.processManager = Objects.requireNonNull(processManager);
}
@Override
protected void doExecute(Task task,
PutDataFrameAnalyticsAction.Request request,
ActionListener<ExplainDataFrameAnalyticsAction.Response> listener) {
if (licenseState.isMachineLearningAllowed() == false) {
listener.onFailure(LicenseUtils.newComplianceException(XPackField.MACHINE_LEARNING));
return;
}
DiscoveryNode localNode = clusterService.localNode();
if (MachineLearning.isMlNode(localNode)) {
explain(task, request, listener);
} else {
redirectToMlNode(request, listener);
}
}
private void explain(Task task, PutDataFrameAnalyticsAction.Request request,
ActionListener<ExplainDataFrameAnalyticsAction.Response> listener) {
ExtractedFieldsDetectorFactory extractedFieldsDetectorFactory = new ExtractedFieldsDetectorFactory(client);
extractedFieldsDetectorFactory.createFromSource(request.getConfig(), true, ActionListener.wrap(
extractedFieldsDetector -> {
explain(task, request, extractedFieldsDetector, listener);
},
listener::onFailure
));
}
private void explain(Task task, PutDataFrameAnalyticsAction.Request request, ExtractedFieldsDetector extractedFieldsDetector,
ActionListener<ExplainDataFrameAnalyticsAction.Response> listener) {
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
ActionListener<MemoryEstimation> memoryEstimationListener = ActionListener.wrap(
memoryEstimation -> listener.onResponse(new ExplainDataFrameAnalyticsAction.Response(fieldExtraction.v2(), memoryEstimation)),
listener::onFailure
);
estimateMemoryUsage(task, request, fieldExtraction.v1(), memoryEstimationListener);
}
/**
* Performs memory usage estimation.
* Memory usage estimation spawns an ML C++ process, which is only available on ML nodes. That's why this method can only
* be called on an ML node.
*/
private void estimateMemoryUsage(Task task,
PutDataFrameAnalyticsAction.Request request,
ExtractedFields extractedFields,
ActionListener<MemoryEstimation> listener) {
final String estimateMemoryTaskId = "memory_usage_estimation_" + task.getId();
DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(
client, estimateMemoryTaskId, request.getConfig(), extractedFields);
processManager.runJobAsync(
estimateMemoryTaskId,
request.getConfig(),
extractorFactory,
ActionListener.wrap(
result -> listener.onResponse(
new MemoryEstimation(result.getExpectedMemoryWithoutDisk(), result.getExpectedMemoryWithDisk())),
listener::onFailure
)
);
}
/**
* Finds the first available ML node in the cluster and redirects the request to this node.
*/
private void redirectToMlNode(PutDataFrameAnalyticsAction.Request request,
ActionListener<ExplainDataFrameAnalyticsAction.Response> listener) {
Optional<DiscoveryNode> node = findMlNode(clusterService.state());
if (node.isPresent()) {
transportService.sendRequest(node.get(), actionName, request,
new ActionListenerResponseHandler<>(listener, ExplainDataFrameAnalyticsAction.Response::new));
} else {
listener.onFailure(ExceptionsHelper.badRequestException("No ML node to run on"));
}
}
/**
* Finds the first available ML node in the cluster state.
*/
private static Optional<DiscoveryNode> findMlNode(ClusterState clusterState) {
for (DiscoveryNode node : clusterState.getNodes()) {
if (MachineLearning.isMlNode(node)) {
return Optional.of(node);
}
}
return Optional.empty();
}
}

View File

@ -29,6 +29,7 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.license.LicenseUtils;
@ -47,7 +48,7 @@ import org.elasticsearch.xpack.core.ClientHelper;
import org.elasticsearch.xpack.core.XPackField;
import org.elasticsearch.xpack.core.ml.MlMetadata;
import org.elasticsearch.xpack.core.ml.MlTasks;
import org.elasticsearch.xpack.core.ml.action.EstimateMemoryUsageAction;
import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.GetDataFrameAnalyticsStatsAction;
import org.elasticsearch.xpack.core.ml.action.PutDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.StartDataFrameAnalyticsAction;
@ -66,6 +67,7 @@ import org.elasticsearch.xpack.ml.dataframe.SourceDestValidator;
import org.elasticsearch.xpack.ml.dataframe.extractor.DataFrameDataExtractorFactory;
import org.elasticsearch.xpack.ml.dataframe.extractor.ExtractedFieldsDetectorFactory;
import org.elasticsearch.xpack.ml.dataframe.persistence.DataFrameAnalyticsConfigProvider;
import org.elasticsearch.xpack.ml.extractor.ExtractedFields;
import org.elasticsearch.xpack.ml.job.JobNodeSelector;
import org.elasticsearch.xpack.ml.notifications.DataFrameAnalyticsAuditor;
import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
@ -190,20 +192,18 @@ public class TransportStartDataFrameAnalyticsAction
final String jobId = startContext.config.getId();
// Tell the job tracker to refresh the memory requirement for this job and all other jobs that have persistent tasks
ActionListener<EstimateMemoryUsageAction.Response> estimateMemoryUsageListener = ActionListener.wrap(
estimateMemoryUsageResponse -> {
auditor.info(
jobId,
Messages.getMessage(
Messages.DATA_FRAME_ANALYTICS_AUDIT_ESTIMATED_MEMORY_USAGE,
estimateMemoryUsageResponse.getExpectedMemoryWithoutDisk()));
ActionListener<ExplainDataFrameAnalyticsAction.Response> explainListener = ActionListener.wrap(
explainResponse -> {
ByteSizeValue expectedMemoryWithoutDisk = explainResponse.getMemoryEstimation().getExpectedMemoryWithoutDisk();
auditor.info(jobId,
Messages.getMessage(Messages.DATA_FRAME_ANALYTICS_AUDIT_ESTIMATED_MEMORY_USAGE, expectedMemoryWithoutDisk));
// Validate that model memory limit is sufficient to run the analysis
if (startContext.config.getModelMemoryLimit()
.compareTo(estimateMemoryUsageResponse.getExpectedMemoryWithoutDisk()) < 0) {
.compareTo(expectedMemoryWithoutDisk) < 0) {
ElasticsearchStatusException e =
ExceptionsHelper.badRequestException(
"Cannot start because the configured model memory limit [{}] is lower than the expected memory usage [{}]",
startContext.config.getModelMemoryLimit(), estimateMemoryUsageResponse.getExpectedMemoryWithoutDisk());
startContext.config.getModelMemoryLimit(), expectedMemoryWithoutDisk);
listener.onFailure(e);
return;
}
@ -215,13 +215,13 @@ public class TransportStartDataFrameAnalyticsAction
listener::onFailure
);
PutDataFrameAnalyticsAction.Request estimateMemoryUsageRequest = new PutDataFrameAnalyticsAction.Request(startContext.config);
PutDataFrameAnalyticsAction.Request explainRequest = new PutDataFrameAnalyticsAction.Request(startContext.config);
ClientHelper.executeAsyncWithOrigin(
client,
ClientHelper.ML_ORIGIN,
EstimateMemoryUsageAction.INSTANCE,
estimateMemoryUsageRequest,
estimateMemoryUsageListener);
ExplainDataFrameAnalyticsAction.INSTANCE,
explainRequest,
explainListener);
}
@ -277,7 +277,11 @@ public class TransportStartDataFrameAnalyticsAction
// Validate extraction is possible
boolean isTaskRestarting = startContext.startingState != DataFrameAnalyticsTask.StartingState.FIRST_TIME;
new ExtractedFieldsDetectorFactory(client).createFromSource(startContext.config, isTaskRestarting, ActionListener.wrap(
extractedFieldsDetector -> toValidateDestEmptyListener.onResponse(startContext), finalListener::onFailure));
extractedFieldsDetector -> {
startContext.extractedFields = extractedFieldsDetector.detect().v1();
toValidateDestEmptyListener.onResponse(startContext);
},
finalListener::onFailure));
},
finalListener::onFailure
);
@ -294,15 +298,11 @@ public class TransportStartDataFrameAnalyticsAction
}
private void validateSourceIndexHasRows(StartContext startContext, ActionListener<StartContext> listener) {
boolean isTaskRestarting = startContext.startingState != DataFrameAnalyticsTask.StartingState.FIRST_TIME;
DataFrameDataExtractorFactory.createForSourceIndices(client,
DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(client,
"validate_source_index_has_rows-" + startContext.config.getId(),
isTaskRestarting,
startContext.config,
ActionListener.wrap(
dataFrameDataExtractorFactory ->
dataFrameDataExtractorFactory
.newExtractor(false)
startContext.extractedFields);
extractorFactory.newExtractor(false)
.collectDataSummaryAsync(ActionListener.wrap(
dataSummary -> {
if (dataSummary.rows == 0) {
@ -320,8 +320,6 @@ public class TransportStartDataFrameAnalyticsAction
}
},
listener::onFailure
)),
listener::onFailure
));
}
@ -402,6 +400,7 @@ public class TransportStartDataFrameAnalyticsAction
private final DataFrameAnalyticsConfig config;
private final List<PhaseProgress> progressOnStart;
private final DataFrameAnalyticsTask.StartingState startingState;
private volatile ExtractedFields extractedFields;
private StartContext(DataFrameAnalyticsConfig config, List<PhaseProgress> progressOnStart) {
this.config = config;

View File

@ -29,7 +29,7 @@ public class DataFrameDataExtractorFactory {
private final Map<String, String> headers;
private final boolean includeRowsWithMissingValues;
private DataFrameDataExtractorFactory(Client client, String analyticsId, List<String> indices, ExtractedFields extractedFields,
public DataFrameDataExtractorFactory(Client client, String analyticsId, List<String> indices, ExtractedFields extractedFields,
Map<String, String> headers, boolean includeRowsWithMissingValues) {
this.client = Objects.requireNonNull(client);
this.analyticsId = Objects.requireNonNull(analyticsId);
@ -66,32 +66,19 @@ public class DataFrameDataExtractorFactory {
}
/**
* Validate and create a new extractor factory
* Create a new extractor factory
*
* The source index must exist and contain at least 1 compatible field or validations will fail.
*
* @param client ES Client used to make calls against the cluster
* @param taskId The task id
* @param isTaskRestarting Whether the task is restarting or it is running for the first time
* @param config The config from which to create the extractor factory
* @param listener The listener to notify on creation or failure
* @param extractedFields The fields to extract
*/
public static void createForSourceIndices(Client client,
String taskId,
boolean isTaskRestarting,
DataFrameAnalyticsConfig config,
ActionListener<DataFrameDataExtractorFactory> listener) {
ExtractedFieldsDetectorFactory extractedFieldsDetectorFactory = new ExtractedFieldsDetectorFactory(client);
extractedFieldsDetectorFactory.createFromSource(config, isTaskRestarting, ActionListener.wrap(
extractedFieldsDetector -> {
ExtractedFields extractedFields = extractedFieldsDetector.detect();
DataFrameDataExtractorFactory extractorFactory = new DataFrameDataExtractorFactory(client, taskId,
Arrays.asList(config.getSource().getIndex()), extractedFields, config.getHeaders(),
config.getAnalysis().supportsMissingValues());
listener.onResponse(extractorFactory);
},
listener::onFailure
));
public static DataFrameDataExtractorFactory createForSourceIndices(Client client, String taskId, DataFrameAnalyticsConfig config,
ExtractedFields extractedFields) {
return new DataFrameDataExtractorFactory(client, taskId, Arrays.asList(config.getSource().getIndex()), extractedFields,
config.getHeaders(), config.getAnalysis().supportsMissingValues());
}
/**
@ -111,7 +98,7 @@ public class DataFrameDataExtractorFactory {
ExtractedFieldsDetectorFactory extractedFieldsDetectorFactory = new ExtractedFieldsDetectorFactory(client);
extractedFieldsDetectorFactory.createFromDest(config, isTaskRestarting, ActionListener.wrap(
extractedFieldsDetector -> {
ExtractedFields extractedFields = extractedFieldsDetector.detect();
ExtractedFields extractedFields = extractedFieldsDetector.detect().v1();
DataFrameDataExtractorFactory extractorFactory = new DataFrameDataExtractorFactory(client, config.getId(),
Collections.singletonList(config.getDest().getIndex()), extractedFields, config.getHeaders(),
config.getAnalysis().supportsMissingValues());

View File

@ -11,6 +11,7 @@ import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.action.fieldcaps.FieldCapabilities;
import org.elasticsearch.action.fieldcaps.FieldCapabilitiesResponse;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.BooleanFieldMapper;
@ -19,6 +20,7 @@ import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsDest;
import org.elasticsearch.xpack.core.ml.dataframe.analyses.RequiredField;
import org.elasticsearch.xpack.core.ml.dataframe.analyses.Types;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.xpack.core.ml.job.messages.Messages;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.core.ml.utils.NameResolver;
@ -29,13 +31,12 @@ import org.elasticsearch.xpack.ml.extractor.ExtractedFields;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
@ -57,9 +58,8 @@ public class ExtractedFieldsDetector {
private final FieldCapabilitiesResponse fieldCapabilitiesResponse;
private final Map<String, Long> fieldCardinalities;
ExtractedFieldsDetector(String[] index, DataFrameAnalyticsConfig config, boolean isTaskRestarting,
int docValueFieldsLimit, FieldCapabilitiesResponse fieldCapabilitiesResponse,
Map<String, Long> fieldCardinalities) {
ExtractedFieldsDetector(String[] index, DataFrameAnalyticsConfig config, boolean isTaskRestarting, int docValueFieldsLimit,
FieldCapabilitiesResponse fieldCapabilitiesResponse, Map<String, Long> fieldCardinalities) {
this.index = Objects.requireNonNull(index);
this.config = Objects.requireNonNull(config);
this.isTaskRestarting = isTaskRestarting;
@ -68,8 +68,30 @@ public class ExtractedFieldsDetector {
this.fieldCardinalities = Objects.requireNonNull(fieldCardinalities);
}
public ExtractedFields detect() {
Set<String> fields = getIncludedFields();
public Tuple<ExtractedFields, List<FieldSelection>> detect() {
TreeSet<FieldSelection> fieldSelection = new TreeSet<>(Comparator.comparing(FieldSelection::getName));
Set<String> fields = getIncludedFields(fieldSelection);
checkFieldsHaveCompatibleTypes(fields);
checkRequiredFields(fields);
checkFieldsWithCardinalityLimit();
ExtractedFields extractedFields = detectExtractedFields(fields, fieldSelection);
addIncludedFields(extractedFields, fieldSelection);
return Tuple.tuple(extractedFields, Collections.unmodifiableList(new ArrayList<>(fieldSelection)));
}
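A small sketch of consuming the new return type (names as in this diff):

// detect() now returns both the fields to extract and, for every field considered,
// an explanation of whether (and why) it was included or excluded.
Tuple<ExtractedFields, List<FieldSelection>> result = extractedFieldsDetector.detect();
ExtractedFields extractedFields = result.v1();      // the fields the analysis will use
List<FieldSelection> fieldSelection = result.v2();  // unmodifiable, sorted by field name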
private Set<String> getIncludedFields(Set<FieldSelection> fieldSelection) {
Set<String> fields = new TreeSet<>(fieldCapabilitiesResponse.get().keySet());
fields.removeAll(IGNORE_FIELDS);
checkResultsFieldIsNotPresent();
removeFieldsUnderResultsField(fields);
FetchSourceContext analyzedFields = config.getAnalyzedFields();
// If the user has not explicitly included fields we'll include all compatible fields
if (analyzedFields == null || analyzedFields.includes().length == 0) {
removeFieldsWithIncompatibleTypes(fields, fieldSelection);
}
includeAndExcludeFields(fields, fieldSelection);
if (fields.isEmpty()) {
throw ExceptionsHelper.badRequestException("No compatible fields could be detected in index {}. Supported types are {}.",
@ -77,28 +99,21 @@ public class ExtractedFieldsDetector {
getSupportedTypes());
}
checkNoIgnoredFields(fields);
checkFieldsHaveCompatibleTypes(fields);
checkRequiredFields(fields);
checkFieldsWithCardinalityLimit();
return detectExtractedFields(fields);
}
private Set<String> getIncludedFields() {
Set<String> fields = new HashSet<>(fieldCapabilitiesResponse.get().keySet());
checkResultsFieldIsNotPresent();
removeFieldsUnderResultsField(fields);
FetchSourceContext analyzedFields = config.getAnalyzedFields();
// If the user has not explicitly included fields we'll include all compatible fields
if (analyzedFields == null || analyzedFields.includes().length == 0) {
fields.removeAll(IGNORE_FIELDS);
removeFieldsWithIncompatibleTypes(fields);
}
includeAndExcludeFields(fields);
return fields;
}
private void removeFieldsUnderResultsField(Set<String> fields) {
String resultsField = config.getDest().getResultsField();
Iterator<String> fieldsIterator = fields.iterator();
while (fieldsIterator.hasNext()) {
String field = fieldsIterator.next();
if (field.startsWith(resultsField + ".")) {
fieldsIterator.remove();
}
}
fields.removeIf(field -> field.startsWith(resultsField + "."));
}
private void checkResultsFieldIsNotPresent() {
// If the task is restarting we do not mind the index containing the results field, as we will overwrite all docs
if (isTaskRestarting) {
@ -117,16 +132,21 @@ public class ExtractedFieldsDetector {
}
}
private void removeFieldsUnderResultsField(Set<String> fields) {
// Ignore fields under the results object
fields.removeIf(field -> field.startsWith(config.getDest().getResultsField() + "."));
private void addExcludedField(String field, String reason, Set<FieldSelection> fieldSelection) {
fieldSelection.add(FieldSelection.excluded(field, getMappingTypes(field), reason));
}
private void removeFieldsWithIncompatibleTypes(Set<String> fields) {
private Set<String> getMappingTypes(String field) {
Map<String, FieldCapabilities> fieldCaps = fieldCapabilitiesResponse.getField(field);
return fieldCaps == null ? Collections.emptySet() : fieldCaps.keySet();
}
private void removeFieldsWithIncompatibleTypes(Set<String> fields, Set<FieldSelection> fieldSelection) {
Iterator<String> fieldsIterator = fields.iterator();
while (fieldsIterator.hasNext()) {
String field = fieldsIterator.next();
if (hasCompatibleType(field) == false) {
addExcludedField(field, "unsupported type; supported types are " + getSupportedTypes(), fieldSelection);
fieldsIterator.remove();
}
}
@ -163,7 +183,7 @@ public class ExtractedFieldsDetector {
return supportedTypes;
}
private void includeAndExcludeFields(Set<String> fields) {
private void includeAndExcludeFields(Set<String> fields, Set<FieldSelection> fieldSelection) {
FetchSourceContext analyzedFields = config.getAnalyzedFields();
if (analyzedFields == null) {
return;
@ -188,18 +208,30 @@ public class ExtractedFieldsDetector {
Messages.getMessage(Messages.DATA_FRAME_ANALYTICS_BAD_FIELD_FILTER, ex)))
.expand(excludes, true);
fields.retainAll(includedSet);
fields.removeAll(excludedSet);
applyIncludesExcludes(fields, includedSet, excludedSet, fieldSelection);
} catch (ResourceNotFoundException ex) {
// Re-wrap our exception so that we throw the same exception type when there are no fields.
throw ExceptionsHelper.badRequestException(ex.getMessage());
}
}
private void checkNoIgnoredFields(Set<String> fields) {
Optional<String> ignoreField = IGNORE_FIELDS.stream().filter(fields::contains).findFirst();
if (ignoreField.isPresent()) {
throw ExceptionsHelper.badRequestException("field [{}] cannot be analyzed", ignoreField.get());
private void applyIncludesExcludes(Set<String> fields, Set<String> includes, Set<String> excludes,
Set<FieldSelection> fieldSelection) {
Iterator<String> fieldsIterator = fields.iterator();
while (fieldsIterator.hasNext()) {
    String field = fieldsIterator.next();
    if (includes.contains(field)) {
        if (IGNORE_FIELDS.contains(field)) {
            throw ExceptionsHelper.badRequestException("field [{}] cannot be analyzed", field);
        }
        if (excludes.contains(field)) {
            fieldsIterator.remove();
            addExcludedField(field, "field in excludes list", fieldSelection);
        }
    } else {
        // A field that is not in the includes list is dropped here and never reaches
        // the excludes check; this also avoids a second remove() on the same element.
        fieldsIterator.remove();
        addExcludedField(field, "field not in includes list", fieldSelection);
    }
}
}
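To make the bookkeeping concrete, a worked example using field names from the tests later
in this diff (the expanded sets are assumptions):

// includes (expanded) = {my_field1, my_field1_nope, your_field2}
// excludes (expanded) = {my_field1_nope}
//   my_field1      -> kept
//   my_field1_nope -> removed; FieldSelection.excluded(..., "field in excludes list")
//   any other name -> removed; FieldSelection.excluded(..., "field not in includes list")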
@ -247,13 +279,10 @@ public class ExtractedFieldsDetector {
}
}
private ExtractedFields detectExtractedFields(Set<String> fields) {
List<String> sortedFields = new ArrayList<>(fields);
// We sort the fields to ensure the checksum for each document is deterministic
Collections.sort(sortedFields);
ExtractedFields extractedFields = ExtractedFields.build(sortedFields, Collections.emptySet(), fieldCapabilitiesResponse);
private ExtractedFields detectExtractedFields(Set<String> fields, Set<FieldSelection> fieldSelection) {
ExtractedFields extractedFields = ExtractedFields.build(fields, Collections.emptySet(), fieldCapabilitiesResponse);
boolean preferSource = extractedFields.getDocValueFields().size() > docValueFieldsLimit;
extractedFields = deduplicateMultiFields(extractedFields, preferSource);
extractedFields = deduplicateMultiFields(extractedFields, preferSource, fieldSelection);
if (preferSource) {
extractedFields = fetchFromSourceIfSupported(extractedFields);
if (extractedFields.getDocValueFields().size() > docValueFieldsLimit) {
@ -266,7 +295,8 @@ public class ExtractedFieldsDetector {
return extractedFields;
}
private ExtractedFields deduplicateMultiFields(ExtractedFields extractedFields, boolean preferSource) {
private ExtractedFields deduplicateMultiFields(ExtractedFields extractedFields, boolean preferSource,
Set<FieldSelection> fieldSelection) {
Set<String> requiredFields = config.getAnalysis().getRequiredFields().stream().map(RequiredField::getName)
.collect(Collectors.toSet());
Map<String, ExtractedField> nameOrParentToField = new LinkedHashMap<>();
@ -276,43 +306,53 @@ public class ExtractedFieldsDetector {
if (existingField != null) {
ExtractedField parent = currentField.isMultiField() ? existingField : currentField;
ExtractedField multiField = currentField.isMultiField() ? currentField : existingField;
nameOrParentToField.put(nameOrParent, chooseMultiFieldOrParent(preferSource, requiredFields, parent, multiField));
nameOrParentToField.put(nameOrParent,
chooseMultiFieldOrParent(preferSource, requiredFields, parent, multiField, fieldSelection));
}
}
return new ExtractedFields(new ArrayList<>(nameOrParentToField.values()));
}
private ExtractedField chooseMultiFieldOrParent(boolean preferSource, Set<String> requiredFields,
ExtractedField parent, ExtractedField multiField) {
private ExtractedField chooseMultiFieldOrParent(boolean preferSource, Set<String> requiredFields, ExtractedField parent,
ExtractedField multiField, Set<FieldSelection> fieldSelection) {
// Check requirements first
if (requiredFields.contains(parent.getName())) {
addExcludedField(multiField.getName(), "[" + parent.getName() + "] is required instead", fieldSelection);
return parent;
}
if (requiredFields.contains(multiField.getName())) {
addExcludedField(parent.getName(), "[" + multiField.getName() + "] is required instead", fieldSelection);
return multiField;
}
// If both are multi-fields it means there are several. In this case the parent is the previous
// multi-field we selected, so we just keep that.
if (parent.isMultiField() && multiField.isMultiField()) {
addExcludedField(multiField.getName(), "[" + parent.getName() + "] came first", fieldSelection);
return parent;
}
// If we prefer source, only the parent may support it. If it does, we pick it immediately.
if (preferSource && parent.supportsFromSource()) {
addExcludedField(multiField.getName(), "[" + parent.getName() + "] is preferred because it supports fetching from source",
fieldSelection);
return parent;
}
// If either of the two is a doc_value field, prefer it, as it supports aggregations.
// We check the parent first as it'd be a shorter field name.
if (parent.getMethod() == ExtractedField.Method.DOC_VALUE) {
addExcludedField(multiField.getName(), "[" + parent.getName() + "] is preferred because it is aggregatable", fieldSelection);
return parent;
}
if (multiField.getMethod() == ExtractedField.Method.DOC_VALUE) {
addExcludedField(parent.getName(), "[" + multiField.getName() + "] is preferred because it is aggregatable", fieldSelection);
return multiField;
}
// Neither is aggregatable. Pick the parent for its shorter name.
addExcludedField(multiField.getName(), "[" + parent.getName() + "] is preferred because none of the multi-fields are aggregatable",
fieldSelection);
return parent;
}
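The precedence above can be traced with a concrete pair from the tests later in this diff:

// Given parent text_1 (type text, not a doc_value field) and multi-field text_1.keyword
// (type keyword, doc_value), with neither field required and preferSource false:
//   - the required-field and several-multi-fields rules do not apply;
//   - text_1 is not a doc_value field, but text_1.keyword is;
//   => text_1.keyword is chosen, and text_1 is excluded with reason
//      "[text_1.keyword] is preferred because it is aggregatable".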
@ -343,6 +383,26 @@ public class ExtractedFieldsDetector {
return new ExtractedFields(adjusted);
}
private void addIncludedFields(ExtractedFields extractedFields, Set<FieldSelection> fieldSelection) {
Set<String> requiredFields = config.getAnalysis().getRequiredFields().stream().map(RequiredField::getName)
.collect(Collectors.toSet());
Set<String> categoricalFields = getCategoricalFields(extractedFields);
for (ExtractedField includedField : extractedFields.getAllFields()) {
FieldSelection.FeatureType featureType = categoricalFields.contains(includedField.getName()) ?
FieldSelection.FeatureType.CATEGORICAL : FieldSelection.FeatureType.NUMERICAL;
fieldSelection.add(FieldSelection.included(includedField.getName(), includedField.getTypes(),
requiredFields.contains(includedField.getName()), featureType));
}
}
private Set<String> getCategoricalFields(ExtractedFields extractedFields) {
return extractedFields.getAllFields().stream()
.filter(extractedField -> config.getAnalysis().getAllowedCategoricalTypes(extractedField.getName())
.containsAll(extractedField.getTypes()))
.map(ExtractedField::getName)
.collect(Collectors.toSet());
}
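As a concrete illustration (mirroring the regression test expectations below):

// For a regression analysis, "keyword" is among the allowed categorical types while
// "float" is not, so:
//   some_keyword -> FieldSelection.FeatureType.CATEGORICAL
//   some_float   -> FieldSelection.FeatureType.NUMERICAL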
private static boolean isBoolean(Set<String> types) {
return types.size() == 1 && types.contains(BooleanFieldMapper.CONTENT_TYPE);
}

View File

@ -100,9 +100,9 @@ public class MemoryUsageEstimationProcessManager {
} finally {
process.consumeAndCloseOutputStream();
try {
LOGGER.info("[{}] Closing process", jobId);
LOGGER.debug("[{}] Closing process", jobId);
process.close();
LOGGER.info("[{}] Closed process", jobId);
LOGGER.debug("[{}] Closed process", jobId);
} catch (Exception e) {
String errorMsg =
new ParameterizedMessage(

View File

@ -1,38 +0,0 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.rest.dataframe;
import org.elasticsearch.client.node.NodeClient;
import org.elasticsearch.rest.BaseRestHandler;
import org.elasticsearch.rest.RestController;
import org.elasticsearch.rest.RestRequest;
import org.elasticsearch.rest.action.RestToXContentListener;
import org.elasticsearch.xpack.core.ml.action.EstimateMemoryUsageAction;
import org.elasticsearch.xpack.core.ml.action.PutDataFrameAnalyticsAction;
import org.elasticsearch.xpack.ml.MachineLearning;
import java.io.IOException;
public class RestEstimateMemoryUsageAction extends BaseRestHandler {
public RestEstimateMemoryUsageAction(RestController controller) {
controller.registerHandler(
RestRequest.Method.POST,
MachineLearning.BASE_PATH + "data_frame/analytics/_estimate_memory_usage", this);
}
@Override
public String getName() {
return "ml_estimate_memory_usage_action";
}
@Override
protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient client) throws IOException {
PutDataFrameAnalyticsAction.Request request =
PutDataFrameAnalyticsAction.Request.parseRequestForMemoryEstimation(restRequest.contentOrSourceParamParser());
return channel -> client.execute(EstimateMemoryUsageAction.INSTANCE, request, new RestToXContentListener<>(channel));
}
}

View File

@ -0,0 +1,84 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.rest.dataframe;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.client.node.NodeClient;
import org.elasticsearch.common.Strings;
import org.elasticsearch.rest.BaseRestHandler;
import org.elasticsearch.rest.RestController;
import org.elasticsearch.rest.RestRequest;
import org.elasticsearch.rest.action.RestToXContentListener;
import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.GetDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.action.PutDataFrameAnalyticsAction;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.ml.MachineLearning;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
public class RestExplainDataFrameAnalyticsAction extends BaseRestHandler {
public RestExplainDataFrameAnalyticsAction(RestController controller) {
controller.registerHandler(RestRequest.Method.GET, MachineLearning.BASE_PATH + "data_frame/analytics/_explain", this);
controller.registerHandler(RestRequest.Method.POST, MachineLearning.BASE_PATH + "data_frame/analytics/_explain", this);
controller.registerHandler(RestRequest.Method.GET, MachineLearning.BASE_PATH + "data_frame/analytics/{"
+ DataFrameAnalyticsConfig.ID.getPreferredName() + "}/_explain", this);
controller.registerHandler(RestRequest.Method.POST, MachineLearning.BASE_PATH + "data_frame/analytics/{"
+ DataFrameAnalyticsConfig.ID.getPreferredName() + "}/_explain", this);
}
@Override
public String getName() {
return "ml_explain_data_frame_analytics_action";
}
@Override
protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient client) throws IOException {
final String jobId = restRequest.param(DataFrameAnalyticsConfig.ID.getPreferredName());
if (Strings.isNullOrEmpty(jobId) && restRequest.hasContentOrSourceParam() == false) {
throw ExceptionsHelper.badRequestException("Please provide a job [{}] or the config object",
DataFrameAnalyticsConfig.ID.getPreferredName());
}
if (Strings.isNullOrEmpty(jobId) == false && restRequest.hasContentOrSourceParam()) {
throw ExceptionsHelper.badRequestException("Please provide either a job [{}] or the config object but not both",
DataFrameAnalyticsConfig.ID.getPreferredName());
}
// We need to consume the body before returning
PutDataFrameAnalyticsAction.Request explainRequestFromBody = Strings.isNullOrEmpty(jobId) ?
PutDataFrameAnalyticsAction.Request.parseRequestForExplain(restRequest.contentOrSourceParamParser()) : null;
return channel -> {
RestToXContentListener<ExplainDataFrameAnalyticsAction.Response> listener = new RestToXContentListener<>(channel);
if (explainRequestFromBody != null) {
client.execute(ExplainDataFrameAnalyticsAction.INSTANCE, explainRequestFromBody, listener);
} else {
GetDataFrameAnalyticsAction.Request getRequest = new GetDataFrameAnalyticsAction.Request(jobId);
getRequest.setAllowNoResources(false);
client.execute(GetDataFrameAnalyticsAction.INSTANCE, getRequest, ActionListener.wrap(
getResponse -> {
List<DataFrameAnalyticsConfig> jobs = getResponse.getResources().results();
if (jobs.size() > 1) {
listener.onFailure(ExceptionsHelper.badRequestException("expected only one config but matched {}",
jobs.stream().map(DataFrameAnalyticsConfig::getId).collect(Collectors.toList())));
} else {
PutDataFrameAnalyticsAction.Request explainRequest = new PutDataFrameAnalyticsAction.Request(jobs.get(0));
client.execute(ExplainDataFrameAnalyticsAction.INSTANCE, explainRequest, listener);
}
},
listener::onFailure
));
}
};
}
}
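A minimal sketch of exercising the two request shapes this handler accepts, via the
low-level REST client (org.elasticsearch.client.Request/Response and a RestClient
instance; index and job names are hypothetical):

// Explain an ad-hoc config supplied in the body:
Request withBody = new Request("POST", "/_ml/data_frame/analytics/_explain");
withBody.setJsonEntity("{\"source\":{\"index\":\"index-source\"},\"analysis\":{\"outlier_detection\":{}}}");
Response bodyResponse = lowLevelClient.performRequest(withBody);

// Explain an existing job by id (supplying both an id and a body is rejected):
Request withId = new Request("GET", "/_ml/data_frame/analytics/my-job/_explain");
Response idResponse = lowLevelClient.performRequest(withId);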

View File

@ -8,6 +8,7 @@ package org.elasticsearch.xpack.ml.dataframe.extractor;
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.action.fieldcaps.FieldCapabilities;
import org.elasticsearch.action.fieldcaps.FieldCapabilitiesResponse;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
import org.elasticsearch.test.ESTestCase;
@ -17,6 +18,7 @@ import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsSource;
import org.elasticsearch.xpack.core.ml.dataframe.analyses.Classification;
import org.elasticsearch.xpack.core.ml.dataframe.analyses.OutlierDetection;
import org.elasticsearch.xpack.core.ml.dataframe.analyses.Regression;
import org.elasticsearch.xpack.core.ml.dataframe.explain.FieldSelection;
import org.elasticsearch.xpack.ml.extractor.ExtractedField;
import org.elasticsearch.xpack.ml.extractor.ExtractedFields;
import org.elasticsearch.xpack.ml.test.SearchHitBuilder;
@ -25,6 +27,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -48,12 +51,15 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(1));
assertThat(allFields.get(0).getName(), equalTo("some_float"));
assertThat(allFields.get(0).getMethod(), equalTo(ExtractedField.Method.DOC_VALUE));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("some_float", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL));
}
public void testDetect_GivenNumericFieldWithMultipleTypes() {
@ -63,12 +69,16 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(1));
assertThat(allFields.get(0).getName(), equalTo("some_number"));
assertThat(allFields.get(0).getMethod(), equalTo(ExtractedField.Method.DOC_VALUE));
assertFieldSelectionContains(fieldExtraction.v2(), FieldSelection.included("some_number",
new HashSet<>(Arrays.asList("long", "integer", "short", "byte", "double", "float", "half_float", "scaled_float")), false,
FieldSelection.FeatureType.NUMERICAL));
}
public void testDetect_GivenOutlierDetectionAndNonNumericField() {
@ -105,14 +115,22 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(3));
assertThat(allFields.stream().map(ExtractedField::getName).collect(Collectors.toSet()),
containsInAnyOrder("some_float", "some_long", "some_boolean"));
assertThat(allFields.stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
contains(equalTo(ExtractedField.Method.DOC_VALUE)));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("some_boolean", Collections.singleton("boolean"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("some_float", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("some_keyword", Collections.singleton("keyword"), "unsupported type; " +
"supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
FieldSelection.included("some_long", Collections.singleton("long"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenRegressionAndMultipleFields() {
@ -126,14 +144,22 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("foo"), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(5));
assertThat(allFields.stream().map(ExtractedField::getName).collect(Collectors.toList()),
containsInAnyOrder("foo", "some_float", "some_keyword", "some_long", "some_boolean"));
assertThat(allFields.stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
contains(equalTo(ExtractedField.Method.DOC_VALUE)));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("foo", Collections.singleton("double"), true, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("some_boolean", Collections.singleton("boolean"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("some_float", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("some_keyword", Collections.singleton("keyword"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.included("some_long", Collections.singleton("long"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenRegressionAndRequiredFieldMissing() {
@ -191,11 +217,16 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(analyzedFields), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(1));
assertThat(allFields.stream().map(ExtractedField::getName).collect(Collectors.toList()), contains("bar"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("bar", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("foo", Collections.singleton("float"), "field in excludes list")
);
}
public void testDetect_GivenRegressionAndRequiredFieldHasInvalidType() {
@ -258,14 +289,15 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
public void testDetect_GivenIncludedIgnoredField() {
FieldCapabilitiesResponse fieldCapabilities = new MockFieldCapsResponseBuilder()
.addAggregatableField("_id", "float").build();
.addAggregatableField("_id", "float")
.build();
FetchSourceContext analyzedFields = new FetchSourceContext(true, new String[]{"_id"}, new String[0]);
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(analyzedFields), false, 100, fieldCapabilities, Collections.emptyMap());
ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, () -> extractedFieldsDetector.detect());
assertThat(e.getMessage(), equalTo("field [_id] cannot be analyzed"));
assertThat(e.getMessage(), equalTo("No field [_id] could be detected"));
}
public void testDetect_ShouldSortFieldsAlphabetically() {
@ -285,9 +317,9 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, equalTo(sortedFields));
}
@ -333,11 +365,17 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(desiredFields), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, equalTo(Arrays.asList("my_field1", "your_field2")));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("my_field1", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("my_field1_nope", Collections.singleton("float"), "field in excludes list"),
FieldSelection.included("your_field2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenIncludedFieldHasUnsupportedType() {
@ -384,11 +422,18 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), true, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, equalTo(Arrays.asList("my_field1", "your_field2")));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("my_field1", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("your_field2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("your_keyword", Collections.singleton("keyword"), "unsupported type; supported types " +
"are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]")
);
}
public void testDetect_GivenIncludedResultsField() {
@ -434,12 +479,12 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), true, 4, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, equalTo(Arrays.asList("field_1", "field_2", "field_3")));
assertThat(extractedFields.getAllFields().stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
assertThat(fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
contains(equalTo(ExtractedField.Method.DOC_VALUE)));
}
@ -453,12 +498,12 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), true, 3, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, equalTo(Arrays.asList("field_1", "field_2", "field_3")));
assertThat(extractedFields.getAllFields().stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
assertThat(fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
contains(equalTo(ExtractedField.Method.DOC_VALUE)));
}
@ -472,12 +517,12 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), true, 2, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, equalTo(Arrays.asList("field_1", "field_2", "field_3")));
assertThat(extractedFields.getAllFields().stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
assertThat(fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getMethod).collect(Collectors.toSet()),
contains(equalTo(ExtractedField.Method.SOURCE)));
}
@ -488,14 +533,18 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildOutlierDetectionConfig(), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(1));
ExtractedField booleanField = allFields.get(0);
assertThat(booleanField.getTypes(), contains("boolean"));
assertThat(booleanField.getMethod(), equalTo(ExtractedField.Method.DOC_VALUE));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("some_boolean", Collections.singleton("boolean"), false, FieldSelection.FeatureType.NUMERICAL)
);
SearchHit hit = new SearchHitBuilder(42).addField("some_boolean", true).build();
assertThat(booleanField.value(hit), arrayContaining(1));
@ -514,14 +563,18 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildClassificationConfig("some_boolean"), false, 100, fieldCapabilities,
Collections.singletonMap("some_boolean", 2L));
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
List<ExtractedField> allFields = extractedFields.getAllFields();
List<ExtractedField> allFields = fieldExtraction.v1().getAllFields();
assertThat(allFields.size(), equalTo(1));
ExtractedField booleanField = allFields.get(0);
assertThat(booleanField.getTypes(), contains("boolean"));
assertThat(booleanField.getMethod(), equalTo(ExtractedField.Method.DOC_VALUE));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("some_boolean", Collections.singleton("boolean"), true, FieldSelection.FeatureType.CATEGORICAL)
);
SearchHit hit = new SearchHitBuilder(42).addField("some_boolean", true).build();
assertThat(booleanField.value(hit), arrayContaining("true"));
@ -546,12 +599,26 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("a_float"), true, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(5));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(5));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("a_float", "keyword_1", "text_1.keyword", "text_2.keyword", "text_without_keyword"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("a_float", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.included("keyword_1", Collections.singleton("keyword"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("keyword_1.text", Collections.singleton("text"),
"[keyword_1] is preferred because it is aggregatable"),
FieldSelection.excluded("text_1", Collections.singleton("text"),
"[text_1.keyword] is preferred because it is aggregatable"),
FieldSelection.included("text_1.keyword", Collections.singleton("keyword"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("text_2", Collections.singleton("text"),
"[text_2.keyword] is preferred because it is aggregatable"),
FieldSelection.included("text_2.keyword", Collections.singleton("keyword"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.included("text_without_keyword", Collections.singleton("text"), false, FieldSelection.FeatureType.CATEGORICAL)
);
}
public void testDetect_GivenMultiFieldAndParentIsRequired() {
@ -563,12 +630,19 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildClassificationConfig("field_1"), true, 100, fieldCapabilities, Collections.singletonMap("field_1", 2L));
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1", "field_2"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("field_1", Collections.singleton("keyword"), true, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("field_1.keyword", Collections.singleton("keyword"),
"[field_1] is required instead"),
FieldSelection.included("field_2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenMultiFieldAndMultiFieldIsRequired() {
@ -581,12 +655,19 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildClassificationConfig("field_1.keyword"), true, 100, fieldCapabilities,
Collections.singletonMap("field_1.keyword", 2L));
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1.keyword", "field_2"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.excluded("field_1", Collections.singleton("keyword"),
"[field_1.keyword] is required instead"),
FieldSelection.included("field_1.keyword", Collections.singleton("keyword"), true, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.included("field_2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenSeveralMultiFields_ShouldPickFirstSorted() {
@ -600,12 +681,21 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("field_2"), true, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1.keyword_1", "field_2"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.excluded("field_1", Collections.singleton("text"),
"[field_1.keyword_1] is preferred because it is aggregatable"),
FieldSelection.included("field_1.keyword_1", Collections.singleton("keyword"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("field_1.keyword_2", Collections.singleton("keyword"), "[field_1.keyword_1] came first"),
FieldSelection.excluded("field_1.keyword_3", Collections.singleton("keyword"), "[field_1.keyword_1] came first"),
FieldSelection.included("field_2", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenMultiFields_OverDocValueLimit() {
@ -617,12 +707,19 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("field_2"), true, 0, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1", "field_2"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("field_1", Collections.singleton("text"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("field_1.keyword_1", Collections.singleton("keyword"),
"[field_1] is preferred because it supports fetching from source"),
FieldSelection.included("field_2", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenParentAndMultiFieldBothAggregatable() {
@ -635,12 +732,20 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("field_2.double"), true, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1", "field_2.double"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("field_1", Collections.singleton("keyword"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("field_1.keyword", Collections.singleton("keyword"),
"[field_1] is preferred because it is aggregatable"),
FieldSelection.included("field_2.double", Collections.singleton("double"), true, FieldSelection.FeatureType.NUMERICAL),
FieldSelection.excluded("field_2.keyword", Collections.singleton("float"), "[field_2.double] is required instead")
);
}
public void testDetect_GivenParentAndMultiFieldNoneAggregatable() {
@ -652,12 +757,19 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("field_2"), true, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1", "field_2"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("field_1", Collections.singleton("text"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("field_1.text", Collections.singleton("text"),
"[field_1] is preferred because none of the multi-fields are aggregatable"),
FieldSelection.included("field_2", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL)
);
}
public void testDetect_GivenMultiFields_AndExplicitlyIncludedFields() {
@ -670,12 +782,18 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
SOURCE_INDEX, buildRegressionConfig("field_2", analyzedFields), false, 100, fieldCapabilities, Collections.emptyMap());
ExtractedFields extractedFields = extractedFieldsDetector.detect();
Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
assertThat(extractedFields.getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = extractedFields.getAllFields().stream().map(ExtractedField::getName)
assertThat(fieldExtraction.v1().getAllFields().size(), equalTo(2));
List<String> extractedFieldNames = fieldExtraction.v1().getAllFields().stream().map(ExtractedField::getName)
.collect(Collectors.toList());
assertThat(extractedFieldNames, contains("field_1", "field_2"));
assertFieldSelectionContains(fieldExtraction.v2(),
FieldSelection.included("field_1", Collections.singleton("text"), false, FieldSelection.FeatureType.CATEGORICAL),
FieldSelection.excluded("field_1.keyword", Collections.singleton("keyword"), "field not in includes list"),
FieldSelection.included("field_2", Collections.singleton("float"), true, FieldSelection.FeatureType.NUMERICAL)
);
}
private static DataFrameAnalyticsConfig buildOutlierDetectionConfig() {
@ -715,6 +833,21 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
.build();
}
/**
* We assert each field individually to get useful error messages in case of failure
*/
private static void assertFieldSelectionContains(List<FieldSelection> actual, FieldSelection... expected) {
assertThat(actual.size(), equalTo(expected.length));
for (int i = 0; i < expected.length; i++) {
assertThat("i = " + i, actual.get(i).getName(), equalTo(expected[i].getName()));
assertThat("i = " + i, actual.get(i).getMappingTypes(), equalTo(expected[i].getMappingTypes()));
assertThat("i = " + i, actual.get(i).isIncluded(), equalTo(expected[i].isIncluded()));
assertThat("i = " + i, actual.get(i).isRequired(), equalTo(expected[i].isRequired()));
assertThat("i = " + i, actual.get(i).getFeatureType(), equalTo(expected[i].getFeatureType()));
assertThat("i = " + i, actual.get(i).getReason(), equalTo(expected[i].getReason()));
}
}
private static class MockFieldCapsResponseBuilder {
private final Map<String, Map<String, FieldCapabilities>> fieldCaps = new HashMap<>();

View File

@ -1,21 +0,0 @@
{
"ml.estimate_memory_usage": {
"documentation": {
"url": "http://www.elastic.co/guide/en/elasticsearch/reference/current/estimate-memory-usage-dfanalytics.html"
},
"stability": "experimental",
"url": {
"paths" : [
{
"path" : "/_ml/data_frame/analytics/_estimate_memory_usage",
"methods": [ "POST" ],
"parts": {}
}
]
},
"body": {
"description" : "Memory usage estimation definition",
"required" : true
}
}
}

View File

@ -0,0 +1,31 @@
{
"ml.explain_data_frame_analytics": {
"documentation": {
"url": "http://www.elastic.co/guide/en/elasticsearch/reference/current/explain-dfanalytics.html"
},
"stability": "experimental",
"url": {
"paths" : [
{
"path" : "/_ml/data_frame/analytics/_explain",
"methods": [ "GET", "POST" ],
"parts": {}
},
{
"path" : "/_ml/data_frame/analytics/{id}/_explain",
"methods": [ "GET", "POST" ],
"parts":{
"id":{
"type":"string",
"description":"The ID of the data frame analytics to explain"
}
}
}
]
},
"body": {
"description" : "The data frame analytics config to explain",
"required" : false
}
}
}

View File

@ -1,84 +0,0 @@
---
setup:
- do:
indices.create:
index: index-source
body:
mappings:
properties:
x:
type: float
y:
type: float
---
"Test memory usage estimation for empty data frame":
- do:
catch: /Unable to estimate memory usage as no documents in the source indices \[index-source\] contained all the fields selected for analysis/
ml.estimate_memory_usage:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- do:
index:
index: index-source
refresh: true
body: { x: 1 }
- match: { result: "created" }
# Note that value for "y" is missing and outlier detection analysis does not support missing values.
# Hence, the data frame is still considered empty.
- do:
catch: /Unable to estimate memory usage as no documents in the source indices \[index-source\] contained all the fields selected for analysis/
ml.estimate_memory_usage:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
---
"Test memory usage estimation for non-empty data frame":
- do:
index:
index: index-source
refresh: true
body: { x: 1, y: 10 }
- match: { result: "created" }
- do:
ml.estimate_memory_usage:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- match: { expected_memory_without_disk: "3kb" }
- match: { expected_memory_with_disk: "3kb" }
- do:
index:
index: index-source
refresh: true
body: { x: 2, y: 20 }
- match: { result: "created" }
- do:
ml.estimate_memory_usage:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- match: { expected_memory_without_disk: "4kb" }
- match: { expected_memory_with_disk: "4kb" }
- do:
index:
index: index-source
refresh: true
body: { x: 3, y: 30 }
- match: { result: "created" }
- do:
ml.estimate_memory_usage:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- match: { expected_memory_without_disk: "6kb" }
- match: { expected_memory_with_disk: "5kb" }

View File

@ -0,0 +1,308 @@
---
"Test neither job id nor body":
- do:
catch: /Please provide a job \[id\] or the config object/
ml.explain_data_frame_analytics:
id: ""
---
"Test both job id and body":
- do:
catch: /Please provide either a job \[id\] or the config object but not both/
ml.explain_data_frame_analytics:
id: "foo"
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
---
"Test missing job":
- do:
catch: missing
ml.explain_data_frame_analytics:
id: "no_such_job"
---
"Test id that matches multiple jobs":
- do:
indices.create:
index: index-source
- do:
ml.put_data_frame_analytics:
id: "foo-1"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {"outlier_detection":{}}
}
- do:
ml.put_data_frame_analytics:
id: "foo-2"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {"outlier_detection":{}}
}
- do:
catch: /expected only one config but matched \[foo-1, foo-2\]/
ml.explain_data_frame_analytics:
id: "foo-*"
---
"Test empty data frame given body":
- do:
indices.create:
index: index-source
body:
mappings:
properties:
x:
type: float
y:
type: float
- do:
catch: /Unable to estimate memory usage as no documents in the source indices \[index-source\] contained all the fields selected for analysis/
ml.explain_data_frame_analytics:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- do:
index:
index: index-source
refresh: true
body: { x: 1 }
- match: { result: "created" }
# Note that value for "y" is missing and outlier detection analysis does not support missing values.
# Hence, the data frame is still considered empty.
- do:
catch: /Unable to estimate memory usage as no documents in the source indices \[index-source\] contained all the fields selected for analysis/
ml.explain_data_frame_analytics:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
---
"Test non-empty data frame given body":
- do:
indices.create:
index: index-source
body:
mappings:
properties:
x:
type: float
y:
type: float
- do:
index:
index: index-source
refresh: true
body: { x: 1, y: 10 }
- match: { result: "created" }
- do:
ml.explain_data_frame_analytics:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- match: { memory_estimation.expected_memory_without_disk: "3kb" }
- match: { memory_estimation.expected_memory_with_disk: "3kb" }
- length: { field_selection: 2 }
- match: { field_selection.0.name: "x" }
- match: { field_selection.0.mapping_types: ["float"] }
- match: { field_selection.0.is_included: true }
- match: { field_selection.0.is_required: false }
- match: { field_selection.0.feature_type: "numerical" }
- is_false: field_selection.0.reason
- match: { field_selection.1.name: "y" }
- match: { field_selection.1.mapping_types: ["float"] }
- match: { field_selection.1.is_included: true }
- match: { field_selection.1.is_required: false }
- match: { field_selection.1.feature_type: "numerical" }
- is_false: field_selection.1.reason
- do:
index:
index: index-source
refresh: true
body: { x: 2, y: 20 }
- match: { result: "created" }
- do:
ml.explain_data_frame_analytics:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- match: { memory_estimation.expected_memory_without_disk: "4kb" }
- match: { memory_estimation.expected_memory_with_disk: "4kb" }
- do:
index:
index: index-source
refresh: true
body: { x: 3, y: 30 }
- match: { result: "created" }
- do:
ml.explain_data_frame_analytics:
body:
source: { index: "index-source" }
analysis: { outlier_detection: {} }
- match: { memory_estimation.expected_memory_without_disk: "6kb" }
- match: { memory_estimation.expected_memory_with_disk: "5kb" }
---
"Test field_selection given body":
- do:
indices.create:
index: index-source
body:
mappings:
properties:
field_1:
type: integer
field_2:
type: double
field_3:
type: date
- do:
index:
index: index-source
refresh: true
body: { field_1: 3, field_2: 3.14, field_3: "2019-11-11T00:00:00", field_4: "blah" }
- match: { result: "created" }
- do:
ml.explain_data_frame_analytics:
body:
source: { index: "index-source" }
analysis: { regression: { dependent_variable: "field_1" } }
- is_true: memory_estimation.expected_memory_without_disk
- is_true: memory_estimation.expected_memory_with_disk
- length: { field_selection: 5 }
- match: { field_selection.0.name: "field_1" }
- match: { field_selection.0.mapping_types: ["integer"] }
- match: { field_selection.0.is_included: true }
- match: { field_selection.0.is_required: true }
- match: { field_selection.0.feature_type: "numerical" }
- is_false: field_selection.0.reason
- match: { field_selection.1.name: "field_2" }
- match: { field_selection.1.mapping_types: ["double"] }
- match: { field_selection.1.is_included: true }
- match: { field_selection.1.is_required: false }
- match: { field_selection.1.feature_type: "numerical" }
- is_false: field_selection.1.reason
- match: { field_selection.2.name: "field_3" }
- match: { field_selection.2.mapping_types: ["date"] }
- match: { field_selection.2.is_included: false }
- match: { field_selection.2.is_required: false }
- is_false: field_selection.2.feature_type
- match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" }
- match: { field_selection.3.name: "field_4" }
- match: { field_selection.3.mapping_types: ["text"] }
- match: { field_selection.3.is_included: false }
- match: { field_selection.3.is_required: false }
- is_false: field_selection.3.feature_type
- match: { field_selection.3.reason: "[field_4.keyword] is preferred because it is aggregatable" }
- match: { field_selection.4.name: "field_4.keyword" }
- match: { field_selection.4.mapping_types: ["keyword"] }
- match: { field_selection.4.is_included: true }
- match: { field_selection.4.is_required: false }
- match: { field_selection.4.feature_type: "categorical" }
- is_false: field_selection.4.reason
---
"Test field_selection given job":
- do:
indices.create:
index: index-source
body:
mappings:
properties:
field_1:
type: integer
field_2:
type: double
field_3:
type: date
- do:
index:
index: index-source
refresh: true
body: { field_1: 3, field_2: 3.14, field_3: "2019-11-11T00:00:00", field_4: "blah" }
- match: { result: "created" }
- do:
ml.put_data_frame_analytics:
id: "got-a-job-for-this-one"
body: >
{
"source": {
"index": "index-source"
},
"dest": {
"index": "index-dest"
},
"analysis": {"regression":{ "dependent_variable": "field_1" }}
}
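  # Explaining by id should yield the same field selection as the inline-body test above.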
- do:
ml.explain_data_frame_analytics:
id: "got-a-job-for-this-one"
- is_true: memory_estimation.expected_memory_without_disk
- is_true: memory_estimation.expected_memory_with_disk
- length: { field_selection: 5 }
- match: { field_selection.0.name: "field_1" }
- match: { field_selection.0.mapping_types: ["integer"] }
- match: { field_selection.0.is_included: true }
- match: { field_selection.0.is_required: true }
- match: { field_selection.0.feature_type: "numerical" }
- is_false: field_selection.0.reason
- match: { field_selection.1.name: "field_2" }
- match: { field_selection.1.mapping_types: ["double"] }
- match: { field_selection.1.is_included: true }
- match: { field_selection.1.is_required: false }
- match: { field_selection.1.feature_type: "numerical" }
- is_false: field_selection.1.reason
- match: { field_selection.2.name: "field_3" }
- match: { field_selection.2.mapping_types: ["date"] }
- match: { field_selection.2.is_included: false }
- match: { field_selection.2.is_required: false }
- is_false: field_selection.2.feature_type
- match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" }
- match: { field_selection.3.name: "field_4" }
- match: { field_selection.3.mapping_types: ["text"] }
- match: { field_selection.3.is_included: false }
- match: { field_selection.3.is_required: false }
- is_false: field_selection.3.feature_type
- match: { field_selection.3.reason: "[field_4.keyword] is preferred because it is aggregatable" }
- match: { field_selection.4.name: "field_4.keyword" }
- match: { field_selection.4.mapping_types: ["keyword"] }
- match: { field_selection.4.is_included: true }
- match: { field_selection.4.is_required: false }
- match: { field_selection.4.feature_type: "categorical" }
- is_false: field_selection.4.reason