[7.x][ML] Do not start data frame analytics when too many docs are analyzed (#62547) (#62558)

The data frame structure in C++ has a limit of 2^32 documents. This commit
adds a check that the number of documents involved in the analysis is
less than that limit, and fails to start the analytics job otherwise. That
saves the cost of reindexing when it is unnecessary.

Backport of #62547
This commit is contained in:
Dimitris Athanasiou 2020-09-17 19:06:38 +03:00 committed by GitHub
parent 7d36393b09
commit f5c28e2054
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 13 additions and 2 deletions

View File

@ -233,6 +233,7 @@ public class Classification implements DataFrameAnalysis {
return numTopClasses;
}
/**
 * Returns the percentage of the data that is used for training.
 *
 * @return the percentage of data to use for training
 */
@Override
public double getTrainingPercent() {
    return this.trainingPercent;
}

View File

@ -83,6 +83,12 @@ public interface DataFrameAnalysis extends ToXContentObject, NamedWriteable {
*/
boolean supportsInference();
/**
 * Returns the percentage of the data that should be used for training.
 * Analyses that do not split their data into train/test sets use the
 * default of {@code 100.0}, i.e. all documents take part in training.
 *
 * @return the percentage of data to use for training
 */
default double getTrainingPercent() {
    return 100.0;
}
/**
* Summarizes information about the fields that is necessary for analysis to generate
* the parameters needed for the process configuration.

View File

@ -188,6 +188,7 @@ public class Regression implements DataFrameAnalysis {
return predictionFieldName;
}
/**
 * Returns the percentage of the data that is used for training.
 *
 * @return the percentage of data to use for training
 */
@Override
public double getTrainingPercent() {
    return this.trainingPercent;
}

View File

@ -325,7 +325,7 @@ public class TransportStartDataFrameAnalyticsAction
private void validateSourceIndexHasAnalyzableData(StartContext startContext, ActionListener<StartContext> listener) {
ActionListener<Void> validateAtLeastOneAnalyzedFieldListener = ActionListener.wrap(
aVoid -> validateSourceIndexHasRows(startContext, listener),
aVoid -> validateSourceIndexRowsCount(startContext, listener),
listener::onFailure
);
@ -354,7 +354,7 @@ public class TransportStartDataFrameAnalyticsAction
}
}
private void validateSourceIndexHasRows(StartContext startContext, ActionListener<StartContext> listener) {
private void validateSourceIndexRowsCount(StartContext startContext, ActionListener<StartContext> listener) {
DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(client,
"validate_source_index_has_rows-" + startContext.config.getId(),
startContext.config,
@ -372,6 +372,9 @@ public class TransportStartDataFrameAnalyticsAction
startContext.config.getId(),
Strings.arrayToCommaDelimitedString(startContext.config.getSource().getIndex())
));
} else if (Math.floor(startContext.config.getAnalysis().getTrainingPercent() * dataSummary.rows) >= Math.pow(2, 32)) {
listener.onFailure(ExceptionsHelper.badRequestException("Unable to start because too many documents " +
"(more than 2^32) are included in the analysis. Consider downsampling."));
} else {
listener.onResponse(startContext);
}