The data frame structure in C++ has a limit of 2^32 documents. This commit adds a check that the number of documents involved in the analysis is less than that limit and fails to start the analytics job otherwise. That saves the cost of reindexing when it is unnecessary. Backport of #62547
This commit is contained in:
parent 7d36393b09
commit f5c28e2054
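For orientation, 2^32 is 4,294,967,296 documents, so the new guard only triggers for very large source indices, and rejecting the job up front is what avoids an unnecessary reindex. Below is a minimal, self-contained sketch of the idea, assuming the number of involved documents is the source row count scaled by the training percentage; the exact expression the patch uses is in the TransportStartDataFrameAnalyticsAction hunk further down, and every name in this sketch is illustrative rather than taken from the codebase.

// Standalone illustration of the document-count guard, not the Elasticsearch code itself.
public class DocLimitSketch {

    // The C++ data frame addresses rows with 32-bit indices, hence the 2^32 ceiling.
    private static final long MAX_DOCS = 1L << 32;

    // Returns true when the documents the analysis would touch stay under the limit.
    // trainingPercent is in (0, 100]; the /100 scaling is this sketch's assumption.
    static boolean withinLimit(long sourceRows, double trainingPercent) {
        double involvedDocs = Math.floor(sourceRows * (trainingPercent / 100.0));
        return involvedDocs < MAX_DOCS;
    }

    public static void main(String[] args) {
        System.out.println(withinLimit(5_000_000_000L, 100.0)); // false: above 2^32, job should not start
        System.out.println(withinLimit(3_000_000_000L, 100.0)); // true: below the ceiling
    }
}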
@@ -233,6 +233,7 @@ public class Classification implements DataFrameAnalysis {
         return numTopClasses;
     }
 
+    @Override
     public double getTrainingPercent() {
         return trainingPercent;
     }
@@ -83,6 +83,12 @@ public interface DataFrameAnalysis extends ToXContentObject, NamedWriteable {
      */
     boolean supportsInference();
 
+    /**
+     * @return the percentage of data to use for training
+     */
+    default double getTrainingPercent() {
+        return 100.0;
+    }
     /**
      * Summarizes information about the fields that is necessary for analysis to generate
      * the parameters needed for the process configuration.
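The default method above exists so that analyses which always use every document do not have to declare a training percentage of their own, while supervised analyses such as Classification and Regression keep overriding it (hence the @Override additions in the neighbouring hunks). A self-contained sketch of that pattern, using a hypothetical TrainingAware interface as a stand-in for DataFrameAnalysis:

// Hypothetical stand-in for DataFrameAnalysis, reduced to the method this commit adds.
interface TrainingAware {
    // Mirrors the new interface default: analyses that do not split their data
    // report 100%, so the document-count check counts every source row.
    default double getTrainingPercent() {
        return 100.0;
    }
}

// Supervised analyses (like Classification and Regression in the diff) override it...
class SupervisedAnalysis implements TrainingAware {
    private final double trainingPercent;

    SupervisedAnalysis(double trainingPercent) {
        this.trainingPercent = trainingPercent;
    }

    @Override
    public double getTrainingPercent() {
        return trainingPercent;
    }
}

// ...while analyses that use every document simply inherit the default.
class WholeDataAnalysis implements TrainingAware {
}

public class Demo {
    public static void main(String[] args) {
        System.out.println(new SupervisedAnalysis(75.0).getTrainingPercent()); // 75.0
        System.out.println(new WholeDataAnalysis().getTrainingPercent());      // 100.0
    }
}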
@@ -188,6 +188,7 @@ public class Regression implements DataFrameAnalysis {
         return predictionFieldName;
     }
 
+    @Override
     public double getTrainingPercent() {
         return trainingPercent;
     }
@@ -325,7 +325,7 @@ public class TransportStartDataFrameAnalyticsAction
 
     private void validateSourceIndexHasAnalyzableData(StartContext startContext, ActionListener<StartContext> listener) {
         ActionListener<Void> validateAtLeastOneAnalyzedFieldListener = ActionListener.wrap(
-            aVoid -> validateSourceIndexHasRows(startContext, listener),
+            aVoid -> validateSourceIndexRowsCount(startContext, listener),
             listener::onFailure
         );
 
@@ -354,7 +354,7 @@ public class TransportStartDataFrameAnalyticsAction
         }
     }
 
-    private void validateSourceIndexHasRows(StartContext startContext, ActionListener<StartContext> listener) {
+    private void validateSourceIndexRowsCount(StartContext startContext, ActionListener<StartContext> listener) {
         DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(client,
             "validate_source_index_has_rows-" + startContext.config.getId(),
             startContext.config,
@@ -372,6 +372,9 @@ public class TransportStartDataFrameAnalyticsAction
                     startContext.config.getId(),
                     Strings.arrayToCommaDelimitedString(startContext.config.getSource().getIndex())
                 ));
+            } else if (Math.floor(startContext.config.getAnalysis().getTrainingPercent() * dataSummary.rows) >= Math.pow(2, 32)) {
+                listener.onFailure(ExceptionsHelper.badRequestException("Unable to start because too many documents " +
+                    "(more than 2^32) are included in the analysis. Consider downsampling."));
             } else {
                 listener.onResponse(startContext);
             }