YARN-8362. Bugfix logic in container retries in node manager.
Contributed by Chandni Singh
(cherry picked from commit 135941e00d
)
This commit is contained in:
parent
11a425d11a
commit
03209e8966
|
@ -1602,8 +1602,10 @@ public class ContainerImpl implements Container {
|
||||||
}
|
}
|
||||||
container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n");
|
container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (container.shouldRetry(container.exitCode)) {
|
if (container.shouldRetry(container.exitCode)) {
|
||||||
|
// Updates to the retry context should be protected from concurrent
|
||||||
|
// writes. It should only be called from this transition.
|
||||||
|
container.retryPolicy.updateRetryContext(container.windowRetryContext);
|
||||||
container.storeRetryContext();
|
container.storeRetryContext();
|
||||||
doRelaunch(container,
|
doRelaunch(container,
|
||||||
container.windowRetryContext.getRemainingRetries(),
|
container.windowRetryContext.getRemainingRetries(),
|
||||||
|
|
|
@ -42,38 +42,58 @@ public class SlidingWindowRetryPolicy {
|
||||||
|
|
||||||
public boolean shouldRetry(RetryContext retryContext,
|
public boolean shouldRetry(RetryContext retryContext,
|
||||||
int errorCode) {
|
int errorCode) {
|
||||||
ContainerRetryContext containerRC = retryContext
|
ContainerRetryContext containerRC = retryContext.containerRetryContext;
|
||||||
.containerRetryContext;
|
|
||||||
Preconditions.checkNotNull(containerRC, "container retry context null");
|
Preconditions.checkNotNull(containerRC, "container retry context null");
|
||||||
ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy();
|
ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy();
|
||||||
if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS
|
if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS
|
||||||
|| (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES
|
|| (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES
|
||||||
&& containerRC.getErrorCodes() != null
|
&& containerRC.getErrorCodes() != null
|
||||||
&& containerRC.getErrorCodes().contains(errorCode))) {
|
&& containerRC.getErrorCodes().contains(errorCode))) {
|
||||||
if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) {
|
return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER
|
||||||
return true;
|
|| calculateRemainingRetries(retryContext) > 0;
|
||||||
}
|
|
||||||
int pendingRetries = calculatePendingRetries(retryContext);
|
|
||||||
updateRetryContext(retryContext, pendingRetries);
|
|
||||||
return pendingRetries > 0;
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculates the pending number of retries.
|
* Calculates the remaining number of retries.
|
||||||
* <p>
|
|
||||||
* When failuresValidityInterval is > 0, it also removes time entries from
|
|
||||||
* <code>restartTimes</code> which are outside the validity interval.
|
|
||||||
*
|
*
|
||||||
* @return the pending retries.
|
* @return the remaining retries.
|
||||||
*/
|
*/
|
||||||
private int calculatePendingRetries(RetryContext retryContext) {
|
private int calculateRemainingRetries(RetryContext retryContext) {
|
||||||
ContainerRetryContext containerRC =
|
ContainerRetryContext containerRC =
|
||||||
retryContext.containerRetryContext;
|
retryContext.containerRetryContext;
|
||||||
if (containerRC.getFailuresValidityInterval() > 0) {
|
if (containerRC.getFailuresValidityInterval() > 0) {
|
||||||
|
int validFailuresCount = 0;
|
||||||
|
long currentTime = clock.getTime();
|
||||||
|
for (int i = retryContext.restartTimes.size() - 1; i >= 0; i--) {
|
||||||
|
long restartTime = retryContext.restartTimes.get(i);
|
||||||
|
if (currentTime - restartTime
|
||||||
|
<= containerRC.getFailuresValidityInterval()) {
|
||||||
|
validFailuresCount++;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return containerRC.getMaxRetries() - validFailuresCount;
|
||||||
|
} else {
|
||||||
|
return retryContext.getRemainingRetries();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates remaining retries and the restart time when
|
||||||
|
* required in the retryContext.
|
||||||
|
* <p>
|
||||||
|
* When failuresValidityInterval is > 0, it also removes time entries from
|
||||||
|
* <code>restartTimes</code> which are outside the validity interval.
|
||||||
|
*/
|
||||||
|
protected void updateRetryContext(RetryContext retryContext) {
|
||||||
|
if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) {
|
||||||
|
ContainerRetryContext containerRC = retryContext.containerRetryContext;
|
||||||
Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
|
Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
|
||||||
long currentTime = clock.getTime();
|
long currentTime = clock.getTime();
|
||||||
|
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
long restartTime = iterator.next();
|
long restartTime = iterator.next();
|
||||||
if (currentTime - restartTime
|
if (currentTime - restartTime
|
||||||
|
@ -83,23 +103,11 @@ public class SlidingWindowRetryPolicy {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return containerRC.getMaxRetries() -
|
retryContext.setRemainingRetries(containerRC.getMaxRetries() -
|
||||||
retryContext.getRestartTimes().size();
|
retryContext.restartTimes.size());
|
||||||
|
retryContext.getRestartTimes().add(currentTime);
|
||||||
} else {
|
} else {
|
||||||
return retryContext.getRemainingRetries();
|
retryContext.remainingRetries--;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Updates remaining retries and the restart time when
|
|
||||||
* required in the retryContext.
|
|
||||||
*/
|
|
||||||
private void updateRetryContext(RetryContext retryContext,
|
|
||||||
int pendingRetries) {
|
|
||||||
retryContext.setRemainingRetries(pendingRetries - 1);
|
|
||||||
if (retryContext.containerRetryContext.getFailuresValidityInterval()
|
|
||||||
> 0) {
|
|
||||||
retryContext.getRestartTimes().add(clock.getTime());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -64,12 +64,18 @@ public class TestSlidingWindowRetryPolicy {
|
||||||
new SlidingWindowRetryPolicy.RetryContext(retryContext);
|
new SlidingWindowRetryPolicy.RetryContext(retryContext);
|
||||||
Assert.assertTrue("retry 1",
|
Assert.assertTrue("retry 1",
|
||||||
retryPolicy.shouldRetry(windowRetryContext, 12));
|
retryPolicy.shouldRetry(windowRetryContext, 12));
|
||||||
|
retryPolicy.updateRetryContext(windowRetryContext);
|
||||||
|
|
||||||
clock.setTime(20);
|
clock.setTime(20);
|
||||||
Assert.assertTrue("retry 2",
|
Assert.assertTrue("retry 2",
|
||||||
retryPolicy.shouldRetry(windowRetryContext, 12));
|
retryPolicy.shouldRetry(windowRetryContext, 12));
|
||||||
|
retryPolicy.updateRetryContext(windowRetryContext);
|
||||||
|
|
||||||
clock.setTime(40);
|
clock.setTime(40);
|
||||||
Assert.assertTrue("retry 3",
|
Assert.assertTrue("retry 3",
|
||||||
retryPolicy.shouldRetry(windowRetryContext, 12));
|
retryPolicy.shouldRetry(windowRetryContext, 12));
|
||||||
|
retryPolicy.updateRetryContext(windowRetryContext);
|
||||||
|
|
||||||
clock.setTime(45);
|
clock.setTime(45);
|
||||||
Assert.assertFalse("retry failed",
|
Assert.assertFalse("retry failed",
|
||||||
retryPolicy.shouldRetry(windowRetryContext, 12));
|
retryPolicy.shouldRetry(windowRetryContext, 12));
|
||||||
|
|
Loading…
Reference in New Issue