YARN-8032. Added ability to configure failure validity interval for YARN service.

Contributed by Chandni Singh

(cherry-picked from 647058efc0)
This commit is contained in:
Eric Yang 2018-03-23 15:54:52 -04:00
parent 148322ca72
commit 20453488e8
4 changed files with 11 additions and 4 deletions

View File

@ -31,6 +31,8 @@ public class YarnServiceConf {
// Retry settings for container failures // Retry settings for container failures
public static final String CONTAINER_RETRY_MAX = "yarn.service.container-failure.retry.max"; public static final String CONTAINER_RETRY_MAX = "yarn.service.container-failure.retry.max";
public static final String CONTAINER_RETRY_INTERVAL = "yarn.service.container-failure.retry-interval-ms"; public static final String CONTAINER_RETRY_INTERVAL = "yarn.service.container-failure.retry-interval-ms";
public static final String CONTAINER_FAILURES_VALIDITY_INTERVAL =
"yarn.service.container-failure.validity-interval-ms";
public static final String AM_RESTART_MAX = "yarn.service.am-restart.max-attempts"; public static final String AM_RESTART_MAX = "yarn.service.am-restart.max-attempts";
public static final String AM_RESOURCE_MEM = "yarn.service.am-resource.memory"; public static final String AM_RESOURCE_MEM = "yarn.service.am-resource.memory";

View File

@ -169,10 +169,11 @@ public class AbstractLauncher {
return containerLaunchContext; return containerLaunchContext;
} }
public void setRetryContext(int maxRetries, int retryInterval) { public void setRetryContext(int maxRetries, int retryInterval,
long failuresValidityInterval) {
ContainerRetryContext retryContext = ContainerRetryContext ContainerRetryContext retryContext = ContainerRetryContext
.newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, maxRetries, .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null,
retryInterval); maxRetries, retryInterval, failuresValidityInterval);
containerLaunchContext.setContainerRetryContext(retryContext); containerLaunchContext.setContainerRetryContext(retryContext);
} }

View File

@ -39,6 +39,7 @@ import java.io.IOException;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURES_VALIDITY_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_INTERVAL; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_MAX; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_MAX;
import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.$; import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.$;
@ -109,6 +110,8 @@ public abstract class AbstractProviderService implements ProviderService,
.getInt(CONTAINER_RETRY_MAX, -1, service.getConfiguration(), .getInt(CONTAINER_RETRY_MAX, -1, service.getConfiguration(),
yarnConf), YarnServiceConf yarnConf), YarnServiceConf
.getInt(CONTAINER_RETRY_INTERVAL, 30000, service.getConfiguration(), .getInt(CONTAINER_RETRY_INTERVAL, 30000, service.getConfiguration(),
yarnConf)); yarnConf),
YarnServiceConf.getLong(CONTAINER_FAILURES_VALIDITY_INTERVAL, -1,
service.getConfiguration(), yarnConf));
} }
} }

View File

@ -113,6 +113,7 @@ Above config make the service AM to be retried at max 10 times.
|yarn.service.client-am.retry-interval-ms | the retry interval in milliseconds for the service client to talk to the service AM. By default, it is 2000, i.e. 2 seconds | |yarn.service.client-am.retry-interval-ms | the retry interval in milliseconds for the service client to talk to the service AM. By default, it is 2000, i.e. 2 seconds |
|yarn.service.container-failure.retry.max | the max number of retries for the container to be auto restarted if it fails. By default, it is set to -1, which means forever. |yarn.service.container-failure.retry.max | the max number of retries for the container to be auto restarted if it fails. By default, it is set to -1, which means forever.
|yarn.service.container-failure.retry-interval-ms| the retry interval in milliseconds for the container to be restarted. By default, it is 30000, i.e. 30 seconds | |yarn.service.container-failure.retry-interval-ms| the retry interval in milliseconds for the container to be restarted. By default, it is 30000, i.e. 30 seconds |
|yarn.service.container-failure.validity-interval-ms | the failure validity interval in milliseconds. When set to a value greater than 0, failures that happened outside of this interval are not counted towards the failure count. By default, it is set to -1, which means that all failures so far are included in the failure count. |
|yarn.service.am-restart.max-attempts| the max number of attempts for the framework AM |yarn.service.am-restart.max-attempts| the max number of attempts for the framework AM
|yarn.service.am-resource.memory | the memory size in MB for the framework AM. By default, it is set to 1024 |yarn.service.am-resource.memory | the memory size in MB for the framework AM. By default, it is set to 1024
|yarn.service.queue | the default queue to which the service will be submitted. By default, it is submitted to `default` queue |yarn.service.queue | the default queue to which the service will be submitted. By default, it is submitted to `default` queue