mirror of https://github.com/apache/druid.git
fix bug in RTR for not correctly exiting after timeout
This commit is contained in:
parent
219a8d7d05
commit
9ab22f238e
|
@ -230,6 +230,13 @@ public class RemoteTaskRunner implements TaskRunner, TaskLogProvider
|
|||
return null;
|
||||
}
|
||||
|
||||
public boolean isWorkerRunningTask(String workerHost, String taskId)
|
||||
{
|
||||
ZkWorker zkWorker = zkWorkers.get(workerHost);
|
||||
|
||||
return (zkWorker != null && zkWorker.getRunningTasks().contains(taskId));
|
||||
}
|
||||
|
||||
/**
|
||||
* A task will be run only if there is no current knowledge in the RemoteTaskRunner of the task.
|
||||
*
|
||||
|
@ -350,6 +357,7 @@ public class RemoteTaskRunner implements TaskRunner, TaskLogProvider
|
|||
|
||||
/**
|
||||
* Adds a task to the pending queue
|
||||
*
|
||||
* @param taskRunnerWorkItem
|
||||
*/
|
||||
private void addPendingTask(final TaskRunnerWorkItem taskRunnerWorkItem)
|
||||
|
@ -506,8 +514,18 @@ public class RemoteTaskRunner implements TaskRunner, TaskLogProvider
|
|||
// Syncing state with Zookeeper - don't assign new tasks until the task we just assigned is actually running
|
||||
// on a worker - this avoids overflowing a worker with tasks
|
||||
synchronized (statusLock) {
|
||||
while (findWorkerRunningTask(task.getId()) == null) {
|
||||
while (!isWorkerRunningTask(theWorker.getHost(), task.getId())) {
|
||||
statusLock.wait(config.getTaskAssignmentTimeoutDuration().getMillis());
|
||||
if (!isWorkerRunningTask(theWorker.getHost(), task.getId())) {
|
||||
log.error(
|
||||
"Something went wrong! %s never ran task %s after %s!",
|
||||
theWorker.getHost(),
|
||||
task.getId(),
|
||||
config.getTaskAssignmentTimeoutDuration()
|
||||
);
|
||||
retryTask(runningTasks.get(task.getId()), theWorker.getHost());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue