[ML] fixing bug where analytics process starts with 0 rows (#45879) (#45988)

The native process requires that there be a non-zero number of rows to analyze. If the flag --rows 0 is passed to the executable, it throws and does not start. When building the configuration for the process we should not start the native process if there are no rows. Adding some logging to indicate what is occurring.
2025-02-26 14:54:56 +00:00 · 2019-08-26 14:18:17 -05:00 · 2019-08-26 14:18:17 -05:00 · a3a4ae0ac2
commit a3a4ae0ac2
parent d64018f8e1
5 changed files with 96 additions and 6 deletions
--- a/x-pack/plugin/ml/qa/ml-with-security/build.gradle
+++ b/x-pack/plugin/ml/qa/ml-with-security/build.gradle
@ -143,6 +143,7 @@ integTest.runner  {
    'ml/start_data_frame_analytics/Test start given source index has no compatible fields',
    'ml/start_data_frame_analytics/Test start with inconsistent body/param ids',
    'ml/start_data_frame_analytics/Test start given dest index is not empty',
+    'ml/start_data_frame_analytics/Test start with compatible fields but no data',
    'ml/start_stop_datafeed/Test start datafeed job, but not open',
    'ml/start_stop_datafeed/Test start non existing datafeed',
    'ml/start_stop_datafeed/Test stop non existing datafeed',
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java
@ -34,6 +34,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.settings.Settings;
@ -225,10 +226,41 @@ public class TransportStartDataFrameAnalyticsAction
    }

    private void getConfigAndValidate(String id, ActionListener<DataFrameAnalyticsConfig> finalListener) {
+
+        // Step 5. Validate that there are analyzable data in the source index
+        ActionListener<DataFrameAnalyticsConfig> validateMappingsMergeListener = ActionListener.wrap(
+            config -> DataFrameDataExtractorFactory.createForSourceIndices(client,
+                "validate_source_index_has_rows-" + id,
+                config,
+                ActionListener.wrap(
+                    dataFrameDataExtractorFactory ->
+                        dataFrameDataExtractorFactory
+                            .newExtractor(false)
+                            .collectDataSummaryAsync(ActionListener.wrap(
+                                dataSummary -> {
+                                    if (dataSummary.rows == 0) {
+                                        finalListener.onFailure(new ElasticsearchStatusException(
+                                            "Unable to start {} as there are no analyzable data in source indices [{}].",
+                                            RestStatus.BAD_REQUEST,
+                                            id,
+                                            Strings.arrayToCommaDelimitedString(config.getSource().getIndex())
+                                        ));
+                                    } else {
+                                        finalListener.onResponse(config);
+                                    }
+                                },
+                                finalListener::onFailure
+                            )),
+                    finalListener::onFailure
+                ))
+            ,
+            finalListener::onFailure
+        );
+
        // Step 4. Validate mappings can be merged
        ActionListener<DataFrameAnalyticsConfig> toValidateMappingsListener = ActionListener.wrap(
            config -> MappingsMerger.mergeMappings(client, config.getHeaders(), config.getSource().getIndex(), ActionListener.wrap(
-                    mappings -> finalListener.onResponse(config), finalListener::onFailure)),
+                mappings -> validateMappingsMergeListener.onResponse(config), finalListener::onFailure)),
            finalListener::onFailure
        );

--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/DataFrameDataExtractor.java
@ -8,6 +8,7 @@ package org.elasticsearch.xpack.ml.dataframe.extractor;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.message.ParameterizedMessage;
+import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.search.ClearScrollAction;
 import org.elasticsearch.action.search.ClearScrollRequest;
 import org.elasticsearch.action.search.SearchAction;
@ -234,14 +235,33 @@ public class DataFrameDataExtractor {
    }

    public DataSummary collectDataSummary() {
-        SearchRequestBuilder searchRequestBuilder = new SearchRequestBuilder(client, SearchAction.INSTANCE)
+        SearchRequestBuilder searchRequestBuilder = buildDataSummarySearchRequestBuilder();
+        SearchResponse searchResponse = executeSearchRequest(searchRequestBuilder);
+        return new DataSummary(searchResponse.getHits().getTotalHits().value, context.extractedFields.getAllFields().size());
+    }
+
+    public void collectDataSummaryAsync(ActionListener<DataSummary> dataSummaryActionListener) {
+        SearchRequestBuilder searchRequestBuilder = buildDataSummarySearchRequestBuilder();
+        final int numberOfFields = context.extractedFields.getAllFields().size();
+
+        ClientHelper.executeWithHeadersAsync(context.headers,
+            ClientHelper.ML_ORIGIN,
+            client,
+            SearchAction.INSTANCE,
+            searchRequestBuilder.request(),
+            ActionListener.wrap(
+                searchResponse -> dataSummaryActionListener.onResponse(
+                    new DataSummary(searchResponse.getHits().getTotalHits().value, numberOfFields)),
+            dataSummaryActionListener::onFailure
+        ));
+    }
+
+    private SearchRequestBuilder buildDataSummarySearchRequestBuilder() {
+        return new SearchRequestBuilder(client, SearchAction.INSTANCE)
            .setIndices(context.indices)
            .setSize(0)
            .setQuery(context.query)
            .setTrackTotalHits(true);
-
-        SearchResponse searchResponse = executeSearchRequest(searchRequestBuilder);
-        return new DataSummary(searchResponse.getHits().getTotalHits().value, context.extractedFields.getAllFields().size());
    }

    public Set<String> getCategoricalFields() {
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessManager.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AnalyticsProcessManager.java
@ -11,6 +11,7 @@ import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.elasticsearch.action.admin.indices.refresh.RefreshAction;
 import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
 import org.elasticsearch.client.Client;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xpack.core.ClientHelper;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
@ -273,7 +274,15 @@ public class AnalyticsProcessManager {
            }

            dataExtractor = dataExtractorFactory.newExtractor(false);
-            process = createProcess(task, createProcessConfig(config, dataExtractor));
+            AnalyticsProcessConfig analyticsProcessConfig = createProcessConfig(config, dataExtractor);
+            LOGGER.trace("[{}] creating analytics process with config [{}]", config.getId(), Strings.toString(analyticsProcessConfig));
+            // If we have no rows, that means there is no data so no point in starting the native process
+            // just finish the task
+            if (analyticsProcessConfig.rows() == 0) {
+                LOGGER.info("[{}] no data found to analyze. Will not start analytics native process.", config.getId());
+                return false;
+            }
+            process = createProcess(task, analyticsProcessConfig);
            DataFrameRowsJoiner dataFrameRowsJoiner = new DataFrameRowsJoiner(config.getId(), client,
                dataExtractorFactory.newExtractor(true));
            resultProcessor = new AnalyticsResultProcessor(id, dataFrameRowsJoiner, this::isProcessKilled, task.getProgressTracker());
--- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml
+++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml
@ -62,6 +62,34 @@
        id: "foo"

 ---
+"Test start with compatible fields but no data":
+  - do:
+      indices.create:
+        index: empty-index-with-compatible-fields
+        body:
+          mappings:
+            properties:
+              long_field: { "type": "long" }
+
+  - do:
+      ml.put_data_frame_analytics:
+        id: "empty-with-compatible-fields"
+        body: >
+          {
+            "source": {
+              "index": "empty-index-with-compatible-fields"
+            },
+            "dest": {
+              "index": "empty-index-with-compatible-fields-dest"
+            },
+            "analysis": {"outlier_detection":{}}
+          }
+
+  - do:
+      catch: /Unable to start empty-with-compatible-fields as there are no analyzable data in source indices \[empty-index-with-compatible-fields\]/
+      ml.start_data_frame_analytics:
+        id: "empty-with-compatible-fields"
+---
 "Test start with inconsistent body/param ids":

  - do: