[ML] Fix possible race condition when starting datafeed (#51302)
The ID of the datafeed's associated job was being obtained frequently by looking up the datafeed task in a map that was being modified in other threads. This could lead to NPEs if the datafeed stopped running at an unexpected time. This change reduces the number of places where a datafeed's associated job ID is looked up to avoid the possibility of failures when the datafeed's task is removed from the map of running tasks during multi-step operations in other threads. Fixes #51285
This commit is contained in:
parent
bfcfcdee33
commit
932c63297f
|
@ -85,13 +85,14 @@ public class DatafeedManager {
|
|||
|
||||
ActionListener<DatafeedJob> datafeedJobHandler = ActionListener.wrap(
|
||||
datafeedJob -> {
|
||||
String jobId = datafeedJob.getJobId();
|
||||
Holder holder = new Holder(task, datafeedId, datafeedJob,
|
||||
new ProblemTracker(auditor, datafeedJob.getJobId()), finishHandler);
|
||||
new ProblemTracker(auditor, jobId), finishHandler);
|
||||
runningDatafeedsOnThisNode.put(task.getAllocationId(), holder);
|
||||
task.updatePersistentTaskState(DatafeedState.STARTED, new ActionListener<PersistentTask<?>>() {
|
||||
@Override
|
||||
public void onResponse(PersistentTask<?> persistentTask) {
|
||||
taskRunner.runWhenJobIsOpened(task);
|
||||
taskRunner.runWhenJobIsOpened(task, jobId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -267,17 +268,23 @@ public class DatafeedManager {
|
|||
}
|
||||
}
|
||||
|
||||
private String getJobId(TransportStartDatafeedAction.DatafeedTask task) {
|
||||
return runningDatafeedsOnThisNode.get(task.getAllocationId()).getJobId();
|
||||
/**
|
||||
* Returns <code>null</code> if the datafeed is not running on this node.
|
||||
*/
|
||||
private String getJobIdIfDatafeedRunningOnThisNode(TransportStartDatafeedAction.DatafeedTask task) {
|
||||
Holder holder = runningDatafeedsOnThisNode.get(task.getAllocationId());
|
||||
if (holder == null) {
|
||||
return null;
|
||||
}
|
||||
return holder.getJobId();
|
||||
}
|
||||
|
||||
private JobState getJobState(PersistentTasksCustomMetaData tasks, TransportStartDatafeedAction.DatafeedTask datafeedTask) {
|
||||
return MlTasks.getJobStateModifiedForReassignments(getJobId(datafeedTask), tasks);
|
||||
private JobState getJobState(PersistentTasksCustomMetaData tasks, String jobId) {
|
||||
return MlTasks.getJobStateModifiedForReassignments(jobId, tasks);
|
||||
}
|
||||
|
||||
private boolean jobHasOpenAutodetectCommunicator(PersistentTasksCustomMetaData tasks,
|
||||
TransportStartDatafeedAction.DatafeedTask datafeedTask) {
|
||||
PersistentTasksCustomMetaData.PersistentTask<?> jobTask = MlTasks.getJobTask(getJobId(datafeedTask), tasks);
|
||||
private boolean jobHasOpenAutodetectCommunicator(PersistentTasksCustomMetaData tasks, String jobId) {
|
||||
PersistentTasksCustomMetaData.PersistentTask<?> jobTask = MlTasks.getJobTask(jobId, tasks);
|
||||
if (jobTask == null) {
|
||||
return false;
|
||||
}
|
||||
|
@ -492,14 +499,14 @@ public class DatafeedManager {
|
|||
|
||||
private final List<TransportStartDatafeedAction.DatafeedTask> tasksToRun = new CopyOnWriteArrayList<>();
|
||||
|
||||
private void runWhenJobIsOpened(TransportStartDatafeedAction.DatafeedTask datafeedTask) {
|
||||
private void runWhenJobIsOpened(TransportStartDatafeedAction.DatafeedTask datafeedTask, String jobId) {
|
||||
ClusterState clusterState = clusterService.state();
|
||||
PersistentTasksCustomMetaData tasks = clusterState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
|
||||
if (getJobState(tasks, datafeedTask) == JobState.OPENED && jobHasOpenAutodetectCommunicator(tasks, datafeedTask)) {
|
||||
if (getJobState(tasks, jobId) == JobState.OPENED && jobHasOpenAutodetectCommunicator(tasks, jobId)) {
|
||||
runTask(datafeedTask);
|
||||
} else {
|
||||
logger.info("Datafeed [{}] is waiting for job [{}] to be opened",
|
||||
datafeedTask.getDatafeedId(), getJobId(datafeedTask));
|
||||
datafeedTask.getDatafeedId(), jobId);
|
||||
tasksToRun.add(datafeedTask);
|
||||
}
|
||||
}
|
||||
|
@ -530,17 +537,19 @@ public class DatafeedManager {
|
|||
|
||||
List<TransportStartDatafeedAction.DatafeedTask> remainingTasks = new ArrayList<>();
|
||||
for (TransportStartDatafeedAction.DatafeedTask datafeedTask : tasksToRun) {
|
||||
if (runningDatafeedsOnThisNode.containsKey(datafeedTask.getAllocationId()) == false) {
|
||||
String jobId = getJobIdIfDatafeedRunningOnThisNode(datafeedTask);
|
||||
if (jobId == null) {
|
||||
// Datafeed is not running on this node any more
|
||||
continue;
|
||||
}
|
||||
JobState jobState = getJobState(currentTasks, datafeedTask);
|
||||
if (jobState == JobState.OPENING || jobHasOpenAutodetectCommunicator(currentTasks, datafeedTask) == false) {
|
||||
JobState jobState = getJobState(currentTasks, jobId);
|
||||
if (jobState == JobState.OPENING || jobHasOpenAutodetectCommunicator(currentTasks, jobId) == false) {
|
||||
remainingTasks.add(datafeedTask);
|
||||
} else if (jobState == JobState.OPENED) {
|
||||
runTask(datafeedTask);
|
||||
} else {
|
||||
logger.warn("Datafeed [{}] is stopping because job [{}] state is [{}]",
|
||||
datafeedTask.getDatafeedId(), getJobId(datafeedTask), jobState);
|
||||
datafeedTask.getDatafeedId(), jobId, jobState);
|
||||
datafeedTask.stop("job_never_opened", TimeValue.timeValueSeconds(20));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue