fix TaskQueue-HRTR deadlock (#6212)

* fix TaskQueue-HRTR deadlock causing https://github.com/apache/incubator-druid/issues/6201

* address review comments
This commit is contained in:
Himanshu 2018-08-25 14:15:57 -07:00 committed by Gian Merlino
parent 28e6ae3664
commit c3aaf8122d
1 changed files with 25 additions and 10 deletions

View File

@ -376,6 +376,7 @@ public class HttpRemoteTaskRunner implements WorkerTaskRunner, TaskLogStreamer
// on a worker - this avoids overflowing a worker with tasks
long waitMs = config.getTaskAssignmentTimeout().toStandardDuration().getMillis();
long waitStart = System.currentTimeMillis();
boolean isTaskAssignmentTimedOut = false;
synchronized (statusLock) {
while (tasks.containsKey(taskId)
&& tasks.get(taskId).getState() == HttpRemoteTaskRunnerWorkItem.State.PENDING) {
@ -383,6 +384,13 @@ public class HttpRemoteTaskRunner implements WorkerTaskRunner, TaskLogStreamer
if (remaining > 0) {
statusLock.wait(remaining);
} else {
isTaskAssignmentTimedOut = true;
break;
}
}
}
if (isTaskAssignmentTimedOut) {
log.makeAlert(
"Task assignment timed out on worker [%s], never ran task [%s] in timeout[%s]!",
workerHost,
@ -390,22 +398,24 @@ public class HttpRemoteTaskRunner implements WorkerTaskRunner, TaskLogStreamer
config.getTaskAssignmentTimeout()
).emit();
taskComplete(workItem, workerHolder, TaskStatus.failure(taskId));
}
return true;
}
}
return true;
}
} else {
return false;
}
}
// CAUTION: This method calls RemoteTaskRunnerWorkItem.setResult(..) which results in TaskQueue.notifyStatus() being called
// because that is attached by TaskQueue to task result future. So, this method must not be called with "statusLock"
// held. See https://github.com/apache/incubator-druid/issues/6201
private void taskComplete(
HttpRemoteTaskRunnerWorkItem taskRunnerWorkItem,
WorkerHolder workerHolder,
TaskStatus taskStatus
)
{
Preconditions.checkState(!Thread.holdsLock(statusLock), "Current thread must not hold statusLock.");
Preconditions.checkNotNull(taskRunnerWorkItem, "taskRunnerWorkItem");
Preconditions.checkNotNull(taskStatus, "taskStatus");
if (workerHolder != null) {
@ -1170,6 +1180,7 @@ public class HttpRemoteTaskRunner implements WorkerTaskRunner, TaskLogStreamer
HttpRemoteTaskRunnerWorkItem taskItem;
boolean shouldShutdownTask = false;
boolean isTaskCompleted = false;
synchronized (statusLock) {
taskItem = tasks.get(taskId);
@ -1293,7 +1304,7 @@ public class HttpRemoteTaskRunner implements WorkerTaskRunner, TaskLogStreamer
TaskRunnerUtils.notifyLocationChanged(listeners, taskId, announcement.getTaskLocation());
}
taskComplete(taskItem, workerHolder, announcement.getTaskStatus());
isTaskCompleted = true;
} else {
log.warn(
"Worker[%s] reported completed task[%s] which is being run by another worker[%s]. Notification ignored.",
@ -1327,6 +1338,10 @@ public class HttpRemoteTaskRunner implements WorkerTaskRunner, TaskLogStreamer
}
}
if (isTaskCompleted) {
taskComplete(taskItem, workerHolder, announcement.getTaskStatus());
}
if (shouldShutdownTask) {
log.warn("Killing task[%s] on worker[%s].", taskId, worker.getHost());
workerHolder.shutdownTask(taskId);