YARN-8060. Added default readiness check for service components.

Contributed by Billie Rinaldi

(cherry picked from 2d0662cfd5)
This commit is contained in:
Eric Yang 2018-04-16 14:24:26 -04:00
parent 20453488e8
commit 2911b5406b
22 changed files with 581 additions and 106 deletions

View File

@ -422,14 +422,15 @@ definitions:
type: string type: string
description: A list of quicklink keys defined at the service level, and to be resolved by this component. description: A list of quicklink keys defined at the service level, and to be resolved by this component.
ReadinessCheck: ReadinessCheck:
description: A custom command or a pluggable helper container to determine the readiness of a container of a component. Readiness for every service is different. Hence the need for a simple interface, with scope to support advanced usecases. description: A check to be performed to determine the readiness of a component instance (a container). If no readiness check is specified, the default readiness check will be used unless the yarn.service.default-readiness-check.enabled configuration property is set to false at the component, service, or system level. The artifact field is currently unsupported but may be implemented in the future, enabling a pluggable helper container to support advanced use cases.
required: required:
- type - type
properties: properties:
type: type:
type: string type: string
description: E.g. HTTP (YARN will perform a simple REST call at a regular interval and expect a 204 No content). description: DEFAULT (AM checks whether the container has an IP and optionally performs a DNS lookup for the container hostname), HTTP (AM performs default checks, plus sends a REST call to the container and expects a response code between 200 and 299), or PORT (AM performs default checks, plus attempts to open a socket connection to the container on a specified port).
enum: enum:
- DEFAULT
- HTTP - HTTP
- PORT - PORT
properties: properties:

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.service; package org.apache.hadoop.yarn.service;
import org.apache.hadoop.yarn.service.component.Component; import org.apache.hadoop.yarn.service.component.Component;
import org.apache.hadoop.yarn.service.conf.YarnServiceConf;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -29,6 +30,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_NODE_BLACKLIST_THRESHOLD;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.NODE_BLACKLIST_THRESHOLD; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.NODE_BLACKLIST_THRESHOLD;
/** /**
@ -51,8 +53,9 @@ public class ContainerFailureTracker {
public ContainerFailureTracker(ServiceContext context, Component component) { public ContainerFailureTracker(ServiceContext context, Component component) {
this.context = context; this.context = context;
this.component = component; this.component = component;
maxFailurePerNode = component.getComponentSpec().getConfiguration() maxFailurePerNode = YarnServiceConf.getInt(NODE_BLACKLIST_THRESHOLD,
.getPropertyInt(NODE_BLACKLIST_THRESHOLD, 3); DEFAULT_NODE_BLACKLIST_THRESHOLD, component.getComponentSpec()
.getConfiguration(), context.scheduler.getConfig());
} }

View File

@ -234,9 +234,10 @@ public class ServiceScheduler extends CompositeService {
createConfigFileCache(context.fs.getFileSystem()); createConfigFileCache(context.fs.getFileSystem());
createAllComponents(); createAllComponents();
containerRecoveryTimeout = getConfig().getInt( containerRecoveryTimeout = YarnServiceConf.getInt(
YarnServiceConf.CONTAINER_RECOVERY_TIMEOUT_MS, YarnServiceConf.CONTAINER_RECOVERY_TIMEOUT_MS,
YarnServiceConf.DEFAULT_CONTAINER_RECOVERY_TIMEOUT_MS); YarnServiceConf.DEFAULT_CONTAINER_RECOVERY_TIMEOUT_MS,
app.getConfiguration(), getConfig());
} }
protected YarnRegistryViewForProviders createYarnRegistryOperations( protected YarnRegistryViewForProviders createYarnRegistryOperations(

View File

@ -60,6 +60,7 @@ public class ReadinessCheck implements Serializable {
@XmlType(name = "type") @XmlType(name = "type")
@XmlEnum @XmlEnum
public enum TypeEnum { public enum TypeEnum {
DEFAULT("DEFAULT"),
HTTP("HTTP"), HTTP("HTTP"),
PORT("PORT"); PORT("PORT");

View File

@ -48,8 +48,9 @@ public class ClientAMProxy extends ServerProxy{
} else { } else {
retryPolicy = retryPolicy =
createRetryPolicy(conf, YarnServiceConf.CLIENT_AM_RETRY_MAX_WAIT_MS, createRetryPolicy(conf, YarnServiceConf.CLIENT_AM_RETRY_MAX_WAIT_MS,
15 * 60 * 1000, YarnServiceConf.CLIENT_AM_RETRY_MAX_INTERVAL_MS, YarnServiceConf.DEFAULT_CLIENT_AM_RETRY_MAX_WAIT_MS,
2 * 1000); YarnServiceConf.CLIENT_AM_RETRY_MAX_INTERVAL_MS,
YarnServiceConf.DEFAULT_CLIENT_AM_RETRY_MAX_INTERVAL_MS);
} }
return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress, return createRetriableProxy(confClone, protocol, ugi, rpc, serverAddress,
retryPolicy); retryPolicy);

View File

@ -668,8 +668,8 @@ public class ServiceClient extends AppAdminClient implements SliderExitCodes,
submissionContext.setApplicationTimeouts(appTimeout); submissionContext.setApplicationTimeouts(appTimeout);
} }
submissionContext.setMaxAppAttempts(YarnServiceConf submissionContext.setMaxAppAttempts(YarnServiceConf
.getInt(YarnServiceConf.AM_RESTART_MAX, 20, app.getConfiguration(), .getInt(YarnServiceConf.AM_RESTART_MAX, DEFAULT_AM_RESTART_MAX, app
conf)); .getConfiguration(), conf));
setLogAggregationContext(app, conf, submissionContext); setLogAggregationContext(app, conf, submissionContext);
@ -695,7 +695,7 @@ public class ServiceClient extends AppAdminClient implements SliderExitCodes,
conf), 1)); conf), 1));
String queue = app.getQueue(); String queue = app.getQueue();
if (StringUtils.isEmpty(queue)) { if (StringUtils.isEmpty(queue)) {
queue = conf.get(YARN_QUEUE, "default"); queue = conf.get(YARN_QUEUE, DEFAULT_YARN_QUEUE);
} }
submissionContext.setQueue(queue); submissionContext.setQueue(queue);
submissionContext.setApplicationName(serviceName); submissionContext.setApplicationName(serviceName);

View File

@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.service.api.records.ServiceState;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEvent; import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEvent;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceId; import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceId;
import org.apache.hadoop.yarn.service.conf.YarnServiceConf;
import org.apache.hadoop.yarn.service.monitor.probe.MonitorUtils; import org.apache.hadoop.yarn.service.monitor.probe.MonitorUtils;
import org.apache.hadoop.yarn.service.monitor.probe.Probe; import org.apache.hadoop.yarn.service.monitor.probe.Probe;
import org.apache.hadoop.yarn.service.provider.ProviderUtils; import org.apache.hadoop.yarn.service.provider.ProviderUtils;
@ -79,6 +80,9 @@ import static org.apache.hadoop.yarn.service.component.ComponentEventType.*;
import static org.apache.hadoop.yarn.service.component.ComponentState.*; import static org.apache.hadoop.yarn.service.component.ComponentState.*;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.*; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.*;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURE_THRESHOLD; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURE_THRESHOLD;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_CONTAINER_FAILURE_THRESHOLD;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_READINESS_CHECK_ENABLED;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_READINESS_CHECK_ENABLED_DEFAULT;
public class Component implements EventHandler<ComponentEvent> { public class Component implements EventHandler<ComponentEvent> {
private static final Logger LOG = LoggerFactory.getLogger(Component.class); private static final Logger LOG = LoggerFactory.getLogger(Component.class);
@ -175,9 +179,15 @@ public class Component implements EventHandler<ComponentEvent> {
dispatcher = scheduler.getDispatcher(); dispatcher = scheduler.getDispatcher();
failureTracker = failureTracker =
new ContainerFailureTracker(context, this); new ContainerFailureTracker(context, this);
probe = MonitorUtils.getProbe(componentSpec.getReadinessCheck()); if (componentSpec.getReadinessCheck() != null ||
maxContainerFailurePerComp = componentSpec.getConfiguration() YarnServiceConf.getBoolean(DEFAULT_READINESS_CHECK_ENABLED,
.getPropertyInt(CONTAINER_FAILURE_THRESHOLD, 10); DEFAULT_READINESS_CHECK_ENABLED_DEFAULT,
componentSpec.getConfiguration(), scheduler.getConfig())) {
probe = MonitorUtils.getProbe(componentSpec.getReadinessCheck());
}
maxContainerFailurePerComp = YarnServiceConf.getInt(
CONTAINER_FAILURE_THRESHOLD, DEFAULT_CONTAINER_FAILURE_THRESHOLD,
componentSpec.getConfiguration(), scheduler.getConfig());
createNumCompInstances(component.getNumberOfContainers()); createNumCompInstances(component.getNumberOfContainers());
setDesiredContainers(component.getNumberOfContainers().intValue()); setDesiredContainers(component.getNumberOfContainers().intValue());
} }

View File

@ -20,7 +20,9 @@ package org.apache.hadoop.yarn.service.component.instance;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.registry.client.api.RegistryConstants;
import org.apache.hadoop.registry.client.binding.RegistryPathUtils; import org.apache.hadoop.registry.client.binding.RegistryPathUtils;
import org.apache.hadoop.registry.client.binding.RegistryUtils;
import org.apache.hadoop.registry.client.types.ServiceRecord; import org.apache.hadoop.registry.client.types.ServiceRecord;
import org.apache.hadoop.registry.client.types.yarn.PersistencePolicies; import org.apache.hadoop.registry.client.types.yarn.PersistencePolicies;
import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.ExitUtil;
@ -520,6 +522,24 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
} }
} }
public String getHostname() {
String domain = getComponent().getScheduler().getConfig()
.get(RegistryConstants.KEY_DNS_DOMAIN);
String hostname;
if (domain == null || domain.isEmpty()) {
hostname = MessageFormat
.format("{0}.{1}.{2}", getCompInstanceName(),
getComponent().getContext().service.getName(),
RegistryUtils.currentUser());
} else {
hostname = MessageFormat
.format("{0}.{1}.{2}.{3}", getCompInstanceName(),
getComponent().getContext().service.getName(),
RegistryUtils.currentUser(), domain);
}
return hostname;
}
@Override @Override
public int compareTo(ComponentInstance to) { public int compareTo(ComponentInstance to) {
long delta = containerStartedTime - to.containerStartedTime; long delta = containerStartedTime - to.containerStartedTime;

View File

@ -20,27 +20,34 @@ package org.apache.hadoop.yarn.service.conf;
import org.apache.hadoop.yarn.service.api.records.Configuration; import org.apache.hadoop.yarn.service.api.records.Configuration;
// ALL SERVICE AM PROPERTIES ADDED TO THIS FILE MUST BE DOCUMENTED
// in the yarn site yarn-service/Configurations.md file.
public class YarnServiceConf { public class YarnServiceConf {
private static final String YARN_SERVICE_PREFIX = "yarn.service."; private static final String YARN_SERVICE_PREFIX = "yarn.service.";
// Retry settings for the ServiceClient to talk to Service AppMaster // Retry settings for the ServiceClient to talk to Service AppMaster
public static final String CLIENT_AM_RETRY_MAX_WAIT_MS = "yarn.service.client-am.retry.max-wait-ms"; public static final String CLIENT_AM_RETRY_MAX_WAIT_MS = "yarn.service.client-am.retry.max-wait-ms";
public static final long DEFAULT_CLIENT_AM_RETRY_MAX_WAIT_MS = 15 * 60 * 1000;
public static final String CLIENT_AM_RETRY_MAX_INTERVAL_MS = "yarn.service.client-am.retry-interval-ms"; public static final String CLIENT_AM_RETRY_MAX_INTERVAL_MS = "yarn.service.client-am.retry-interval-ms";
public static final long DEFAULT_CLIENT_AM_RETRY_MAX_INTERVAL_MS = 2 * 1000;
// Retry settings for container failures // Retry settings for container failures
public static final String CONTAINER_RETRY_MAX = "yarn.service.container-failure.retry.max"; public static final String CONTAINER_RETRY_MAX = "yarn.service.container-failure.retry.max";
public static final int DEFAULT_CONTAINER_RETRY_MAX = -1;
public static final String CONTAINER_RETRY_INTERVAL = "yarn.service.container-failure.retry-interval-ms"; public static final String CONTAINER_RETRY_INTERVAL = "yarn.service.container-failure.retry-interval-ms";
public static final int DEFAULT_CONTAINER_RETRY_INTERVAL = 30000;
public static final String CONTAINER_FAILURES_VALIDITY_INTERVAL = public static final String CONTAINER_FAILURES_VALIDITY_INTERVAL =
"yarn.service.container-failure.validity-interval-ms"; "yarn.service.container-failure.validity-interval-ms";
public static final long DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL = -1;
public static final String AM_RESTART_MAX = "yarn.service.am-restart.max-attempts"; public static final String AM_RESTART_MAX = "yarn.service.am-restart.max-attempts";
public static final int DEFAULT_AM_RESTART_MAX = 20;
public static final String AM_RESOURCE_MEM = "yarn.service.am-resource.memory"; public static final String AM_RESOURCE_MEM = "yarn.service.am-resource.memory";
public static final long DEFAULT_KEY_AM_RESOURCE_MEM = 1024; public static final long DEFAULT_KEY_AM_RESOURCE_MEM = 1024;
public static final String DEFAULT_AM_JVM_XMX = " -Xmx768m ";
public static final String YARN_QUEUE = "yarn.service.queue"; public static final String YARN_QUEUE = "yarn.service.queue";
public static final String DEFAULT_YARN_QUEUE = "default";
public static final String API_SERVER_ADDRESS = "yarn.service.api-server.address"; public static final String API_SERVER_ADDRESS = "yarn.service.api-server.address";
public static final String DEFAULT_API_SERVER_ADDRESS = "0.0.0.0:"; public static final String DEFAULT_API_SERVER_ADDRESS = "0.0.0.0:";
@ -67,11 +74,14 @@ public class YarnServiceConf {
*/ */
public static final String CONTAINER_FAILURE_THRESHOLD = public static final String CONTAINER_FAILURE_THRESHOLD =
"yarn.service.container-failure-per-component.threshold"; "yarn.service.container-failure-per-component.threshold";
public static final int DEFAULT_CONTAINER_FAILURE_THRESHOLD = 10;
/** /**
* Maximum number of container failures on a node before the node is blacklisted * Maximum number of container failures on a node before the node is blacklisted
*/ */
public static final String NODE_BLACKLIST_THRESHOLD = public static final String NODE_BLACKLIST_THRESHOLD =
"yarn.service.node-blacklist.threshold"; "yarn.service.node-blacklist.threshold";
public static final int DEFAULT_NODE_BLACKLIST_THRESHOLD = 3;
/** /**
* The failure count for CONTAINER_FAILURE_THRESHOLD and NODE_BLACKLIST_THRESHOLD * The failure count for CONTAINER_FAILURE_THRESHOLD and NODE_BLACKLIST_THRESHOLD
@ -79,6 +89,7 @@ public class YarnServiceConf {
*/ */
public static final String CONTAINER_FAILURE_WINDOW = public static final String CONTAINER_FAILURE_WINDOW =
"yarn.service.failure-count-reset.window"; "yarn.service.failure-count-reset.window";
public static final long DEFAULT_CONTAINER_FAILURE_WINDOW = 21600;
/** /**
* interval between readiness checks. * interval between readiness checks.
@ -86,10 +97,18 @@ public class YarnServiceConf {
public static final String READINESS_CHECK_INTERVAL = "yarn.service.readiness-check-interval.seconds"; public static final String READINESS_CHECK_INTERVAL = "yarn.service.readiness-check-interval.seconds";
public static final int DEFAULT_READINESS_CHECK_INTERVAL = 30; // seconds public static final int DEFAULT_READINESS_CHECK_INTERVAL = 30; // seconds
/**
* Default readiness check enabled.
*/
public static final String DEFAULT_READINESS_CHECK_ENABLED =
"yarn.service.default-readiness-check.enabled";
public static final boolean DEFAULT_READINESS_CHECK_ENABLED_DEFAULT = true;
/** /**
* JVM opts. * JVM opts.
*/ */
public static final String JVM_OPTS = "yarn.service.am.java.opts"; public static final String JVM_OPTS = "yarn.service.am.java.opts";
public static final String DEFAULT_AM_JVM_XMX = " -Xmx768m ";
/** /**
* How long to wait until a container is considered dead. * How long to wait until a container is considered dead.
@ -126,6 +145,12 @@ public class YarnServiceConf {
return userConf.getPropertyInt(name, systemConf.getInt(name, defaultValue)); return userConf.getPropertyInt(name, systemConf.getInt(name, defaultValue));
} }
public static boolean getBoolean(String name, boolean defaultValue,
Configuration userConf, org.apache.hadoop.conf.Configuration systemConf) {
return userConf.getPropertyBool(name, systemConf.getBoolean(name,
defaultValue));
}
public static String get(String name, String defaultVal, public static String get(String name, String defaultVal,
Configuration userConf, org.apache.hadoop.conf.Configuration systemConf) { Configuration userConf, org.apache.hadoop.conf.Configuration systemConf) {
return userConf.getProperty(name, systemConf.get(name, defaultVal)); return userConf.getProperty(name, systemConf.get(name, defaultVal));

View File

@ -43,6 +43,7 @@ import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanc
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.BECOME_READY; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.BECOME_READY;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState.READY; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState.READY;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURE_WINDOW; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURE_WINDOW;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_CONTAINER_FAILURE_WINDOW;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_READINESS_CHECK_INTERVAL; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_READINESS_CHECK_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.READINESS_CHECK_INTERVAL; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.READINESS_CHECK_INTERVAL;
@ -81,7 +82,7 @@ public class ServiceMonitor extends AbstractService {
// Default 6 hours. // Default 6 hours.
long failureResetInterval = YarnServiceConf long failureResetInterval = YarnServiceConf
.getLong(CONTAINER_FAILURE_WINDOW, 21600, .getLong(CONTAINER_FAILURE_WINDOW, DEFAULT_CONTAINER_FAILURE_WINDOW,
context.service.getConfiguration(), conf); context.service.getConfiguration(), conf);
executorService executorService
@ -109,11 +110,15 @@ public class ServiceMonitor extends AbstractService {
ProbeStatus status = instance.ping(); ProbeStatus status = instance.ping();
if (status.isSuccess()) { if (status.isSuccess()) {
if (instance.getState() == STARTED) { if (instance.getState() == STARTED) {
LOG.info("Readiness check succeeded for {}: {}", instance
.getCompInstanceName(), status);
// synchronously update the state. // synchronously update the state.
instance.handle( instance.handle(
new ComponentInstanceEvent(entry.getKey(), BECOME_READY)); new ComponentInstanceEvent(entry.getKey(), BECOME_READY));
} }
} else { } else {
LOG.info("Readiness check failed for {}: {}", instance
.getCompInstanceName(), status);
if (instance.getState() == READY) { if (instance.getState() == READY) {
instance.handle( instance.handle(
new ComponentInstanceEvent(entry.getKey(), BECOME_NOT_READY)); new ComponentInstanceEvent(entry.getKey(), BECOME_NOT_READY));

View File

@ -0,0 +1,99 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.service.monitor.probe;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.utils.ServiceRegistryUtils;
import org.apache.hadoop.yarn.service.utils.ServiceUtils;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
/**
* A probe that checks whether the AM has retrieved an IP for a container.
* Optional parameters enable a subsequent check for whether a DNS lookup can
* be performed for the container's hostname. Configurable properties include:
*
* dns.check.enabled - true if DNS check should be performed (default false)
* dns.address - optional IP:port address of DNS server to use for DNS check
*/
public class DefaultProbe extends Probe {
private final boolean dnsCheckEnabled;
private final String dnsAddress;
public DefaultProbe(Map<String, String> props) {
this("Default probe: IP presence", props);
}
protected DefaultProbe(String name, Map<String, String> props) {
this.dnsCheckEnabled = getPropertyBool(props,
DEFAULT_PROBE_DNS_CHECK_ENABLED,
DEFAULT_PROBE_DNS_CHECK_ENABLED_DEFAULT);
this.dnsAddress = props.get(DEFAULT_PROBE_DNS_ADDRESS);
String additionalName = "";
if (dnsCheckEnabled) {
if (dnsAddress == null) {
additionalName = " with DNS checking";
} else {
additionalName = " with DNS checking and DNS server address " +
dnsAddress;
}
}
setName(name + additionalName);
}
public static DefaultProbe create() throws IOException {
return new DefaultProbe(Collections.emptyMap());
}
public static DefaultProbe create(Map<String, String> props) throws
IOException {
return new DefaultProbe(props);
}
@Override
public ProbeStatus ping(ComponentInstance instance) {
ProbeStatus status = new ProbeStatus();
ContainerStatus containerStatus = instance.getContainerStatus();
if (containerStatus == null || ServiceUtils.isEmpty(containerStatus
.getIPs())) {
status.fail(this, new IOException(
instance.getCompInstanceName() + ": IP is not available yet"));
return status;
}
String hostname = instance.getHostname();
if (dnsCheckEnabled && !ServiceRegistryUtils.registryDNSLookupExists(
dnsAddress, hostname)) {
status.fail(this, new IOException(
instance.getCompInstanceName() + ": DNS checking is enabled, but " +
"lookup for " + hostname + " is not available yet"));
return status;
}
status.succeed(this);
return status;
}
protected boolean isDnsCheckEnabled() {
return dnsCheckEnabled;
}
}

View File

@ -17,11 +17,7 @@
package org.apache.hadoop.yarn.service.monitor.probe; package org.apache.hadoop.yarn.service.monitor.probe;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.utils.ServiceUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -30,7 +26,20 @@ import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.util.Map; import java.util.Map;
public class HttpProbe extends Probe { /**
* A probe that checks whether a successful HTTP response code can be obtained
* from a container. A well-formed URL must be provided. The URL is intended
* to contain a token ${THIS_HOST} that will be replaced by the IP of the
* container. This probe also performs the checks of the {@link DefaultProbe}.
* Additional configurable properties include:
*
* url - required URL for HTTP connection, e.g. http://${THIS_HOST}:8080
* timeout - connection timeout (default 1000)
* min.success - minimum response code considered successful (default 200)
* max.success - maximum response code considered successful (default 299)
*
*/
public class HttpProbe extends DefaultProbe {
protected static final Logger log = LoggerFactory.getLogger(HttpProbe.class); protected static final Logger log = LoggerFactory.getLogger(HttpProbe.class);
private static final String HOST_TOKEN = "${THIS_HOST}"; private static final String HOST_TOKEN = "${THIS_HOST}";
@ -40,9 +49,9 @@ public class HttpProbe extends Probe {
private final int min, max; private final int min, max;
public HttpProbe(String url, int timeout, int min, int max, Configuration public HttpProbe(String url, int timeout, int min, int max,
conf) { Map<String, String> props) {
super("Http probe of " + url + " [" + min + "-" + max + "]", conf); super("Http probe of " + url + " [" + min + "-" + max + "]", props);
this.urlString = url; this.urlString = url;
this.timeout = timeout; this.timeout = timeout;
this.min = min; this.min = min;
@ -59,7 +68,7 @@ public class HttpProbe extends Probe {
WEB_PROBE_MIN_SUCCESS_DEFAULT); WEB_PROBE_MIN_SUCCESS_DEFAULT);
int maxSuccess = getPropertyInt(props, WEB_PROBE_MAX_SUCCESS, int maxSuccess = getPropertyInt(props, WEB_PROBE_MAX_SUCCESS,
WEB_PROBE_MAX_SUCCESS_DEFAULT); WEB_PROBE_MAX_SUCCESS_DEFAULT);
return new HttpProbe(urlString, timeout, minSuccess, maxSuccess, null); return new HttpProbe(urlString, timeout, minSuccess, maxSuccess, props);
} }
@ -73,15 +82,11 @@ public class HttpProbe extends Probe {
@Override @Override
public ProbeStatus ping(ComponentInstance instance) { public ProbeStatus ping(ComponentInstance instance) {
ProbeStatus status = new ProbeStatus(); ProbeStatus status = super.ping(instance);
ContainerStatus containerStatus = instance.getContainerStatus(); if (!status.isSuccess()) {
if (containerStatus == null || ServiceUtils.isEmpty(containerStatus.getIPs())
|| StringUtils.isEmpty(containerStatus.getHost())) {
status.fail(this, new IOException("IP is not available yet"));
return status; return status;
} }
String ip = instance.getContainerStatus().getIPs().get(0);
String ip = containerStatus.getIPs().get(0);
HttpURLConnection connection = null; HttpURLConnection connection = null;
try { try {
URL url = new URL(urlString.replace(HOST_TOKEN, ip)); URL url = new URL(urlString.replace(HOST_TOKEN, ip));

View File

@ -22,6 +22,18 @@ package org.apache.hadoop.yarn.service.monitor.probe;
*/ */
public interface MonitorKeys { public interface MonitorKeys {
/**
* Default probing key : DNS check enabled {@value}.
*/
String DEFAULT_PROBE_DNS_CHECK_ENABLED = "dns.check.enabled";
/**
* Default probing default : DNS check enabled {@value}.
*/
boolean DEFAULT_PROBE_DNS_CHECK_ENABLED_DEFAULT = false;
/**
* Default probing key : DNS checking address IP:port {@value}.
*/
String DEFAULT_PROBE_DNS_ADDRESS = "dns.address";
/** /**
* Port probing key : port to attempt to create a TCP connection to {@value}. * Port probing key : port to attempt to create a TCP connection to {@value}.
*/ */

View File

@ -61,20 +61,20 @@ public final class MonitorUtils {
} }
public static Probe getProbe(ReadinessCheck readinessCheck) { public static Probe getProbe(ReadinessCheck readinessCheck) {
if (readinessCheck == null) {
return null;
}
if (readinessCheck.getType() == null) {
return null;
}
try { try {
if (readinessCheck == null) {
return DefaultProbe.create();
}
if (readinessCheck.getType() == null) {
return DefaultProbe.create(readinessCheck.getProperties());
}
switch (readinessCheck.getType()) { switch (readinessCheck.getType()) {
case HTTP: case HTTP:
return HttpProbe.create(readinessCheck.getProperties()); return HttpProbe.create(readinessCheck.getProperties());
case PORT: case PORT:
return PortProbe.create(readinessCheck.getProperties()); return PortProbe.create(readinessCheck.getProperties());
default: default:
return null; return DefaultProbe.create(readinessCheck.getProperties());
} }
} catch (Throwable t) { } catch (Throwable t) {
throw new IllegalArgumentException("Error creating readiness check " + throw new IllegalArgumentException("Error creating readiness check " +

View File

@ -19,7 +19,6 @@ package org.apache.hadoop.yarn.service.monitor.probe;
import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.utils.ServiceUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -29,15 +28,20 @@ import java.net.Socket;
import java.util.Map; import java.util.Map;
/** /**
* Probe for a port being open. * A probe that checks whether a container has a specified port open. This
* probe also performs the checks of the {@link DefaultProbe}. Additional
* configurable properties include:
*
* port - required port for socket connection
* timeout - connection timeout (default 1000)
*/ */
public class PortProbe extends Probe { public class PortProbe extends DefaultProbe {
protected static final Logger log = LoggerFactory.getLogger(PortProbe.class); protected static final Logger log = LoggerFactory.getLogger(PortProbe.class);
private final int port; private final int port;
private final int timeout; private final int timeout;
public PortProbe(int port, int timeout) { public PortProbe(int port, int timeout, Map<String, String> props) {
super("Port probe of " + port + " for " + timeout + "ms", null); super("Port probe of " + port + " for " + timeout + "ms", props);
this.port = port; this.port = port;
this.timeout = timeout; this.timeout = timeout;
} }
@ -54,7 +58,7 @@ public class PortProbe extends Probe {
int timeout = getPropertyInt(props, PORT_PROBE_CONNECT_TIMEOUT, int timeout = getPropertyInt(props, PORT_PROBE_CONNECT_TIMEOUT,
PORT_PROBE_CONNECT_TIMEOUT_DEFAULT); PORT_PROBE_CONNECT_TIMEOUT_DEFAULT);
return new PortProbe(port, timeout); return new PortProbe(port, timeout, props);
} }
/** /**
@ -65,12 +69,8 @@ public class PortProbe extends Probe {
*/ */
@Override @Override
public ProbeStatus ping(ComponentInstance instance) { public ProbeStatus ping(ComponentInstance instance) {
ProbeStatus status = new ProbeStatus(); ProbeStatus status = super.ping(instance);
if (!status.isSuccess()) {
if (instance.getContainerStatus() == null || ServiceUtils
.isEmpty(instance.getContainerStatus().getIPs())) {
status.fail(this, new IOException(
instance.getCompInstanceName() + ": IP is not available yet"));
return status; return status;
} }

View File

@ -18,7 +18,6 @@
package org.apache.hadoop.yarn.service.monitor.probe; package org.apache.hadoop.yarn.service.monitor.probe;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import java.io.IOException; import java.io.IOException;
@ -29,18 +28,18 @@ import java.util.Map;
*/ */
public abstract class Probe implements MonitorKeys { public abstract class Probe implements MonitorKeys {
protected final Configuration conf;
private String name; private String name;
protected Probe() {
}
/** /**
* Create a probe of a specific name * Create a probe of a specific name
* *
* @param name probe name * @param name probe name
* @param conf configuration being stored.
*/ */
public Probe(String name, Configuration conf) { public Probe(String name) {
this.name = name; this.name = name;
this.conf = conf;
} }
@ -82,6 +81,15 @@ public abstract class Probe implements MonitorKeys {
return Integer.parseInt(value); return Integer.parseInt(value);
} }
public static boolean getPropertyBool(Map<String, String> props, String name,
boolean defaultValue) {
String value = props.get(name);
if (StringUtils.isEmpty(value)) {
return defaultValue;
}
return Boolean.parseBoolean(value);
}
/** /**
* perform any prelaunch initialization * perform any prelaunch initialization
*/ */

View File

@ -42,6 +42,9 @@ import java.util.Map.Entry;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURES_VALIDITY_INTERVAL; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURES_VALIDITY_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_INTERVAL; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_MAX; import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_RETRY_MAX;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_CONTAINER_RETRY_INTERVAL;
import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.DEFAULT_CONTAINER_RETRY_MAX;
import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.$; import static org.apache.hadoop.yarn.service.utils.ServiceApiUtil.$;
public abstract class AbstractProviderService implements ProviderService, public abstract class AbstractProviderService implements ProviderService,
@ -106,12 +109,14 @@ public abstract class AbstractProviderService implements ProviderService,
} }
// By default retry forever every 30 seconds // By default retry forever every 30 seconds
launcher.setRetryContext(YarnServiceConf launcher.setRetryContext(
.getInt(CONTAINER_RETRY_MAX, -1, service.getConfiguration(), YarnServiceConf.getInt(CONTAINER_RETRY_MAX, DEFAULT_CONTAINER_RETRY_MAX,
yarnConf), YarnServiceConf component.getConfiguration(), yarnConf),
.getInt(CONTAINER_RETRY_INTERVAL, 30000, service.getConfiguration(), YarnServiceConf.getInt(CONTAINER_RETRY_INTERVAL,
DEFAULT_CONTAINER_RETRY_INTERVAL, component.getConfiguration(),
yarnConf), yarnConf),
YarnServiceConf.getLong(CONTAINER_FAILURES_VALIDITY_INTERVAL, -1, YarnServiceConf.getLong(CONTAINER_FAILURES_VALIDITY_INTERVAL,
service.getConfiguration(), yarnConf)); DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL,
component.getConfiguration(), yarnConf));
} }
} }

View File

@ -17,8 +17,6 @@
*/ */
package org.apache.hadoop.yarn.service.provider.docker; package org.apache.hadoop.yarn.service.provider.docker;
import org.apache.hadoop.registry.client.api.RegistryConstants;
import org.apache.hadoop.registry.client.binding.RegistryUtils;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.provider.AbstractProviderService; import org.apache.hadoop.yarn.service.provider.AbstractProviderService;
import org.apache.hadoop.yarn.service.api.records.Service; import org.apache.hadoop.yarn.service.api.records.Service;
@ -26,7 +24,6 @@ import org.apache.hadoop.yarn.service.utils.SliderFileSystem;
import org.apache.hadoop.yarn.service.containerlaunch.AbstractLauncher; import org.apache.hadoop.yarn.service.containerlaunch.AbstractLauncher;
import java.io.IOException; import java.io.IOException;
import java.text.MessageFormat;
public class DockerProviderService extends AbstractProviderService public class DockerProviderService extends AbstractProviderService
implements DockerKeys { implements DockerKeys {
@ -38,19 +35,7 @@ public class DockerProviderService extends AbstractProviderService
launcher.setDockerImage(compInstance.getCompSpec().getArtifact().getId()); launcher.setDockerImage(compInstance.getCompSpec().getArtifact().getId());
launcher.setDockerNetwork(compInstance.getCompSpec().getConfiguration() launcher.setDockerNetwork(compInstance.getCompSpec().getConfiguration()
.getProperty(DOCKER_NETWORK)); .getProperty(DOCKER_NETWORK));
String domain = compInstance.getComponent().getScheduler().getConfig() launcher.setDockerHostname(compInstance.getHostname());
.get(RegistryConstants.KEY_DNS_DOMAIN);
String hostname;
if (domain == null || domain.isEmpty()) {
hostname = MessageFormat
.format("{0}.{1}.{2}", compInstance.getCompInstanceName(),
service.getName(), RegistryUtils.currentUser());
} else {
hostname = MessageFormat
.format("{0}.{1}.{2}.{3}", compInstance.getCompInstanceName(),
service.getName(), RegistryUtils.currentUser(), domain);
}
launcher.setDockerHostname(hostname);
launcher.setRunPrivilegedContainer( launcher.setRunPrivilegedContainer(
compInstance.getCompSpec().getRunPrivilegedContainer()); compInstance.getCompSpec().getRunPrivilegedContainer());
} }

View File

@ -20,9 +20,23 @@ package org.apache.hadoop.yarn.service.utils;
import org.apache.hadoop.registry.client.binding.RegistryUtils; import org.apache.hadoop.registry.client.binding.RegistryUtils;
import org.apache.hadoop.yarn.service.conf.YarnServiceConstants; import org.apache.hadoop.yarn.service.conf.YarnServiceConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.naming.Context;
import javax.naming.NameNotFoundException;
import javax.naming.NamingException;
import javax.naming.directory.Attributes;
import javax.naming.directory.DirContext;
import javax.naming.directory.InitialDirContext;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Hashtable;
public class ServiceRegistryUtils { public class ServiceRegistryUtils {
private static final Logger LOG =
LoggerFactory.getLogger(ServiceRegistryUtils.class);
public static final String SVC_USERS = "/services/yarn/users"; public static final String SVC_USERS = "/services/yarn/users";
@ -53,4 +67,50 @@ public class ServiceRegistryUtils {
public static String mkUserHomePath(String username) { public static String mkUserHomePath(String username) {
return SVC_USERS + "/" + username; return SVC_USERS + "/" + username;
} }
/**
* Determine whether a DNS lookup exists for a given name. If a DNS server
* address is provided, the lookup will be performed against this DNS
* server. This option is provided because it may be desirable to perform
* the lookup against Registry DNS directly to avoid caching of negative
* responses that may be performed by other DNS servers, thereby allowing the
* lookup to succeed sooner.
*
* @param addr host:port dns address, or null
* @param name name to look up
* @return true if a lookup succeeds for the specified name
*/
public static boolean registryDNSLookupExists(String addr, String
name) {
if (addr == null) {
try {
InetAddress.getByName(name);
return true;
} catch (UnknownHostException e) {
return false;
}
}
String dnsURI = String.format("dns://%s", addr);
Hashtable<String, Object> env = new Hashtable<>();
env.put(Context.INITIAL_CONTEXT_FACTORY,
"com.sun.jndi.dns.DnsContextFactory");
env.put(Context.PROVIDER_URL, dnsURI);
try {
DirContext ictx = new InitialDirContext(env);
Attributes attrs = ictx.getAttributes(name, new String[]{"A"});
if (attrs.size() > 0) {
return true;
}
} catch (NameNotFoundException e) {
// this doesn't need to be logged
} catch (NamingException e) {
LOG.error("Got exception when performing DNS lookup", e);
}
return false;
}
} }

View File

@ -0,0 +1,155 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.service.monitor.probe;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.service.api.records.ReadinessCheck;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
/**
* Tests for default probe.
*/
@RunWith(Parameterized.class)
public class TestDefaultProbe {
private final DefaultProbe probe;
public TestDefaultProbe(Probe probe) {
this.probe = (DefaultProbe) probe;
}
@Parameterized.Parameters
public static Collection<Object[]> data() {
// test run 1: Default probe checks that container has an IP
Probe p1 = MonitorUtils.getProbe(null);
// test run 2: Default probe with DNS check for component instance hostname
ReadinessCheck rc2 = new ReadinessCheck()
.type(ReadinessCheck.TypeEnum.DEFAULT)
.properties(Collections.singletonMap(
MonitorKeys.DEFAULT_PROBE_DNS_CHECK_ENABLED, "true"));
Probe p2 = MonitorUtils.getProbe(rc2);
// test run 3: Default probe with DNS check using specific DNS server
Map<String, String> props = new HashMap<>();
props.put(MonitorKeys.DEFAULT_PROBE_DNS_CHECK_ENABLED, "true");
props.put(MonitorKeys.DEFAULT_PROBE_DNS_ADDRESS, "8.8.8.8");
ReadinessCheck rc3 = new ReadinessCheck()
.type(ReadinessCheck.TypeEnum.DEFAULT).properties(props);
Probe p3 = MonitorUtils.getProbe(rc3);
return Arrays.asList(new Object[][] {{p1}, {p2}, {p3}});
}
@Test
public void testDefaultProbe() {
// component instance has a good hostname, so probe will eventually succeed
// whether or not DNS checking is enabled
ComponentInstance componentInstance =
createMockComponentInstance("example.com");
checkPingResults(probe, componentInstance, false);
// component instance has a bad hostname, so probe will fail when DNS
// checking is enabled
componentInstance = createMockComponentInstance("bad.dns.test");
checkPingResults(probe, componentInstance, probe.isDnsCheckEnabled());
}
private static void checkPingResults(Probe probe, ComponentInstance
componentInstance, boolean expectDNSCheckFailure) {
// on the first ping, null container status results in failure
ProbeStatus probeStatus = probe.ping(componentInstance);
assertFalse("Expected failure for " + probeStatus.toString(),
probeStatus.isSuccess());
assertTrue("Expected IP failure for " + probeStatus.toString(),
probeStatus.toString().contains(
componentInstance.getCompInstanceName() + ": IP is not available yet"));
// on the second ping, container status is retrieved but there are no
// IPs, resulting in failure
probeStatus = probe.ping(componentInstance);
assertFalse("Expected failure for " + probeStatus.toString(),
probeStatus.isSuccess());
assertTrue("Expected IP failure for " + probeStatus.toString(),
probeStatus.toString().contains(componentInstance
.getCompInstanceName() + ": IP is not available yet"));
// on the third ping, IPs are retrieved and success depends on whether or
// not a DNS lookup can be performed for the component instance hostname
probeStatus = probe.ping(componentInstance);
if (expectDNSCheckFailure) {
assertFalse("Expected failure for " + probeStatus.toString(),
probeStatus.isSuccess());
assertTrue("Expected DNS failure for " + probeStatus.toString(),
probeStatus.toString().contains(componentInstance
.getCompInstanceName() + ": DNS checking is enabled, but lookup" +
" for " + componentInstance.getHostname() + " is not available " +
"yet"));
} else {
assertTrue("Expected success for " + probeStatus.toString(),
probeStatus.isSuccess());
}
}
private static ComponentInstance createMockComponentInstance(String
hostname) {
ComponentInstance componentInstance = mock(ComponentInstance.class);
when(componentInstance.getHostname()).thenReturn(hostname);
when(componentInstance.getCompInstanceName()).thenReturn("comp-0");
when(componentInstance.getContainerStatus())
.thenAnswer(new Answer<ContainerStatus>() {
private int count = 0;
@Override
public ContainerStatus answer(InvocationOnMock invocationOnMock) {
count++;
if (count == 1) {
// first call to getContainerStatus returns null
return null;
} else if (count == 2) {
// second call returns a ContainerStatus with no IPs
ContainerStatus containerStatus = mock(ContainerStatus.class);
when(containerStatus.getIPs()).thenReturn(null);
return containerStatus;
} else {
// third call returns a ContainerStatus with one IP
ContainerStatus containerStatus = mock(ContainerStatus.class);
when(containerStatus.getIPs())
.thenReturn(Collections.singletonList("1.2.3.4"));
return containerStatus;
}
}
});
return componentInstance;
}
}

View File

@ -12,7 +12,9 @@
limitations under the License. See accompanying LICENSE file. limitations under the License. See accompanying LICENSE file.
--> -->
This document describes how to configure the services to be deployed on YARN # YARN Service Configurations
This document describes how to configure the services to be deployed on YARN.
There are mainly three types of configurations: There are mainly three types of configurations:
@ -93,7 +95,19 @@ Below is how a configuration object typically looks like:
## Configuration for YARN service AM ## Configuration for YARN service AM
This section describes the configurations for configuring the YARN service AM. This section describes the configurations for configuring the YARN service AM.
These can be specified either in the cluster `yarn-site.xml` at the global level or in the `properties` field of the `Configuration` object as per service basis like below: ### System-wide configuration properties
System-wide service AM properties can only be configured in the cluster `yarn-site.xml` file.
| System-Level Config Name | Description |
| ------------ | ------------- |
|yarn.service.framework.path | HDFS parent directory where the service AM dependency tarball can be found.|
|yarn.service.base.path | HDFS parent directory where service artifacts will be stored (default ${user_home_dir}/.yarn/).
|yarn.service.client-am.retry.max-wait-ms | Max retry time in milliseconds for the service client to talk to the service AM (default 900000, i.e. 15 minutes).|
|yarn.service.client-am.retry-interval-ms | Retry interval in milliseconds for the service client to talk to the service AM (default 2000, i.e. 2 seconds).|
|yarn.service.queue | Default queue to which the service will be submitted (default submits to the `default` queue). Note that queue can be specified per-service through the queue field, rather than through the service-level configuration properties.|
### Service-level configuration properties
Service-level service AM configuration properties can be specified either in the cluster `yarn-site.xml` at the global level (effectively overriding the default values system-wide) or specified per service in the `properties` field of the `Configuration` object as in the example below:
``` ```
{ {
"configuration" : { "configuration" : {
@ -103,34 +117,97 @@ These can be specified either in the cluster `yarn-site.xml` at the global level
} }
} }
``` ```
Above config make the service AM to be retried at max 10 times. The above config allows the service AM to be retried a maximum of 10 times.
#### Available configurations: | Service-Level Config Name | Description |
| Name | Description |
| ------------ | ------------- | | ------------ | ------------- |
|yarn.service.client-am.retry.max-wait-ms | the max retry time in milliseconds for the service client to talk to the service AM. By default, it is set to 0, which means no retry | |yarn.service.am-restart.max-attempts | Max number of times to start the service AM, after which the service will be killed (default 20).|
|yarn.service.client-am.retry-interval-ms | the retry interval in milliseconds for the service client to talk to the service AM. By default, it is 2000, i.e. 2 seconds | |yarn.service.am-resource.memory | Memory size in GB for the service AM (default 1024).|
|yarn.service.container-failure.retry.max | the max number of retries for the container to be auto restarted if it fails. By default, it is set to -1, which means forever. |yarn.service.am.java.opts | Additional JVM options for the service AM (default " -Xmx768m" will be appended to any JVM opts that do not specify -Xmx).|
|yarn.service.container-failure.retry-interval-ms| the retry interval in milliseconds for the container to be restarted. By default, it is 30000, i.e. 30 seconds | |yarn.service.container-recovery.timeout.ms | Timeout in milliseconds after which a newly started service AM releases all the containers of previous AM attempts which are not yet recovered from the RM (default 120000, i.e. 2 minutes).|
|yarn.service.container-failure.validity-interval-ms | the failure validity interval in milliseconds which when set to a value greater than 0, will not take the failures that happened outside of this interval into failure count. By default, it is set to -1, which means that all the failures so far will be included in failure count. | |yarn.service.failure-count-reset.window | Interval in seconds after which the container failure counts that will be evaluated for the per-component `yarn.service.container-failure-per-component.threshold` and `yarn.service.node-blacklist.threshold` are reset (default 21600, i.e. 6 hours).|
|yarn.service.am-restart.max-attempts| the max number of attempts for the framework AM |yarn.service.readiness-check-interval.seconds | Interval in seconds between readiness checks (default 30 seconds).|
|yarn.service.am-resource.memory | the memory size in GB for the framework AM. By default, it is set to 1024 |yarn.service.log.include-pattern | Regex expression for including log files by name when aggregating the logs after the application completes (default includes all files).|
|yarn.service.queue | the default queue to which the service will be submitted. By default, it is submitted to `default` queue |yarn.service.log.exclude-pattern | Regex expression for excluding log files by name when aggregating the logs after the application completes. If the log file name matches both include and exclude pattern, this file will be excluded (default does not exclude any files).|
|yarn.service.base.path | the root location for the service artifacts on hdfs for a user. By default, it is under ${user_home_dir}/.yarn/ |yarn.service.rolling-log.include-pattern | Regex expression for including log files by name when aggregating the logs while app is running.|
|yarn.service.container-failure-per-component.threshold | the max number of container failures for a given component before the AM exits. |yarn.service.rolling-log.exclude-pattern | Regex expression for excluding log files by name when aggregating the logs while app is running. If the log file name matches both include and exclude pattern, this file will be excluded.|
|yarn.service.node-blacklist.threshold | Maximum number of container failures on a node before the node is blacklisted by the AM
|yarn.service.failure-count-reset.window | The interval in seconds when the `yarn.service.container-failure-per-component.threshold` and `yarn.service.node-blacklist.threshold` gets reset. By default, it is 21600, i.e. 6 hours ### Component-level configuration properties
|yarn.service.readiness-check-interval.seconds | The interval in seconds between readiness checks. By default, it is 30 seconds Component-level service AM configuration properties can be specified either in the cluster `yarn-site.xml` at the global level (effectively overriding the default values system-wide), specified per service in the `properties` field of the `Configuration` object, or specified per component in the `properties` field of the component's `Configuration` object.
|yarn.service.log.include-pattern| The regex expression for including log files whose file name matches it when aggregating the logs after the application completes.
|yarn.service.log.exclude-pattern| The regex expression for excluding log files whose file name matches it when aggregating the logs after the application completes. If the log file name matches both include and exclude pattern, this file will be excluded. | Component-Level Config Name | Description |
|yarn.service.rolling-log.include-pattern| The regex expression for including log files whose file name matches it when aggregating the logs while app is running. | ------------ | ------------- |
|yarn.service.rolling-log.exclude-pattern| The regex expression for excluding log files whose file name matches it when aggregating the logs while app is running. If the log file name matches both include and exclude pattern, this file will be excluded. |yarn.service.container-failure.retry.max | Max number of retries for the container to be auto restarted if it fails (default -1, which means forever).|
|yarn.service.container-recovery.timeout.ms| The timeout in milliseconds after which the service AM releases all the containers of previous attempt which are not yet recovered by the RM. By default, it is set to 120000, i.e. 2 minutes. |yarn.service.container-failure.retry-interval-ms | Retry interval in milliseconds for the container to be restarted (default 30000, i.e. 30 seconds).|
|yarn.service.container-failure.validity-interval-ms | Failure validity interval in milliseconds. When set to a value greater than 0, the container retry policy will not take the failures that happened outside of this interval into the failure count (default -1, which means that all the failures so far will be included in the failure count).|
|yarn.service.container-failure-per-component.threshold | Max number of container failures (not including retries) for a given component before the AM stops the service (default 10).|
|yarn.service.node-blacklist.threshold | Maximum number of container failures on a node (not including retries) before the node is blacklisted by the AM (default 3).|
|yarn.service.default-readiness-check.enabled | Whether or not the default readiness check is enabled (default true).|
There is one component-level configuration property that is set differently in the `yarn-site.xml` file than it is in the service specification.
To select the docker network type that will be used for docker containers, `docker.network` may be set in the service `Configuration` `properties` or the component `Configuration` `properties`.
The system-wide default for the docker network type (for both YARN service containers and all other application containers) is set via the `yarn.nodemanager.runtime.linux.docker.default-container-network` property in the `yarn-site.xml` file.
### Component-level readiness check properties
The AM can be configured to perform readiness checks for containers through the `Component` field `readiness_check`.
A container will not reach the `READY` state until its readiness check succeeds.
If no readiness check is specified, the default readiness check is performed unless it is disabled through the `yarn.service.default-readiness-check.enabled` component-level configuration property.
The default readiness check succeeds when an IP becomes available for a container.
There are also optional properties that configure a DNS check in addition to the IP check.
DNS checking ensures that a DNS lookup succeeds for the container hostname before the container is considered ready.
For example, DNS checking can be enabled for the default readiness check as follows:
```
"readiness_check": {
"type": "DEFAULT",
"properties": {
"dns.check.enabled": "true"
}
},
```
Here is a full list of configurable properties for readiness checks that can be performed by the AM.
| Readiness Check | Configurable Property | Description |
| ------------ | ------------- | ------------- |
|DEFAULT, HTTP, PORT| dns.check.enabled | true if DNS check should be performed (default false)|
|DEFAULT, HTTP, PORT| dns.address | optional IP:port address of DNS server to use for DNS check|
|HTTP| url | required URL for HTTP response check, e.g. http://${THIS_HOST}:8080|
|HTTP| timeout | connection timeout (default 1000)|
|HTTP| min.success | minimum response code considered successful (default 200)|
|HTTP| max.success | maximum response code considered successful (default 299)|
|PORT| port | required port for socket connection|
|PORT| timeout | socket connection timeout (default 1000)|
HTTP readiness check example:
```
"readiness_check": {
"type": "HTTP",
"properties": {
"url": "http://${THIS_HOST}:8080"
}
},
```
PORT readiness check example:
```
"readiness_check": {
"type": "PORT",
"properties": {
"port": "8080"
}
},
```
#### Warning on configuring readiness checks with `host` network for docker containers
When the `host` docker network is configured for a component that has more than one container and the containers are binding to a specific port, there will be a port collision if the containers happen to be allocated on the same host.
HTTP and PORT readiness checks will not be valid in this situation.
In particular, both containers (the one that successfully binds to the port and the one that does not) may have their HTTP or PORT readiness check succeed since the checks are being performed against the same IP (the host's IP).
A valid configuration for such a service could use the anti-affinity placement policy, ensuring that containers will be assigned on different hosts so that port collisions will not occur.
## Constant variables for custom service ## Constant variables for custom service
The service framework provides some constant variables for user to configure their services. These variables are either dynamically generated by the system or are static ones such as service name defined by the user. The service framework provides some constant variables for user to configure their services. These variables are either dynamically generated by the system or are static ones such as service name defined by the user.
User can use these constants in their configurations to be dynamically substituted by the service AM.E.g. User can use these constants in their configurations to be dynamically substituted by the service AM. E.g.
``` ```
{ {
"type" : "HADOOP_XML", "type" : "HADOOP_XML",

View File

@ -349,11 +349,13 @@ The type of placement - affinity/anti-affinity/affinity-with-cardinality with co
### ReadinessCheck ### ReadinessCheck
A custom command or a pluggable helper container to determine the readiness of a container of a component. Readiness for every service is different. Hence the need for a simple interface, with scope to support advanced usecases. A check to be performed to determine the readiness of a component instance (a container).
If no readiness check is specified, the default readiness check will be used unless the yarn.service.default-readiness-check.enabled configuration property is set to false at the component or global level.
The artifact field is currently unsupported but may be implemented in the future, enabling a pluggable helper container to support advanced use cases.
|Name|Description|Required|Schema|Default| |Name|Description|Required|Schema|Default|
|----|----|----|----|----| |----|----|----|----|----|
|type|E.g. HTTP (YARN will perform a simple REST call at a regular interval and expect a 204 No content).|true|enum (HTTP, PORT)|| |type|DEFAULT (AM checks whether the container has an IP and optionally performs a DNS lookup for the container hostname), HTTP (AM performs default checks, plus sends a REST call to the container and expects a response code between 200 and 299), or PORT (AM performs default checks, plus attempts to open a socket connection to the container on a specified port).|true|enum (DEFAULT, HTTP, PORT)||
|properties|A blob of key value pairs that will be used to configure the check.|false|object|| |properties|A blob of key value pairs that will be used to configure the check.|false|object||
|artifact|Artifact of the pluggable readiness check helper container (optional). If specified, this helper container typically hosts the http uri and encapsulates the complex scripts required to perform actual container readiness check. At the end it is expected to respond a 204 No content just like the simplified use case. This pluggable framework benefits service owners who can run services without any packaging modifications. Note, artifacts of type docker only is supported for now. NOT IMPLEMENTED YET|false|Artifact|| |artifact|Artifact of the pluggable readiness check helper container (optional). If specified, this helper container typically hosts the http uri and encapsulates the complex scripts required to perform actual container readiness check. At the end it is expected to respond a 204 No content just like the simplified use case. This pluggable framework benefits service owners who can run services without any packaging modifications. Note, artifacts of type docker only is supported for now. NOT IMPLEMENTED YET|false|Artifact||