YARN-7486. Race condition in service AM that can cause NPE. Contributed by Jian He

This commit is contained in:
Billie Rinaldi 2017-11-15 10:20:46 -08:00
parent 462e25a3b2
commit f4d5d20286
13 changed files with 290 additions and 126 deletions

View File

@ -132,7 +132,6 @@ public class ServiceScheduler extends CompositeService {
private AMRMClientAsync<AMRMClient.ContainerRequest> amRMClient; private AMRMClientAsync<AMRMClient.ContainerRequest> amRMClient;
private NMClientAsync nmClient; private NMClientAsync nmClient;
private AsyncDispatcher dispatcher; private AsyncDispatcher dispatcher;
AsyncDispatcher compInstanceDispatcher;
private YarnRegistryViewForProviders yarnRegistryOperations; private YarnRegistryViewForProviders yarnRegistryOperations;
private ServiceContext context; private ServiceContext context;
private ContainerLaunchService containerLaunchService; private ContainerLaunchService containerLaunchService;
@ -152,7 +151,7 @@ public void buildInstance(ServiceContext context, Configuration configuration)
yarnRegistryOperations = yarnRegistryOperations =
createYarnRegistryOperations(context, registryClient); createYarnRegistryOperations(context, registryClient);
// register metrics // register metrics,
serviceMetrics = ServiceMetrics serviceMetrics = ServiceMetrics
.register(app.getName(), "Metrics for service"); .register(app.getName(), "Metrics for service");
serviceMetrics.tag("type", "Metrics type [component or service]", "service"); serviceMetrics.tag("type", "Metrics type [component or service]", "service");
@ -167,14 +166,11 @@ public void buildInstance(ServiceContext context, Configuration configuration)
dispatcher = new AsyncDispatcher("Component dispatcher"); dispatcher = new AsyncDispatcher("Component dispatcher");
dispatcher.register(ComponentEventType.class, dispatcher.register(ComponentEventType.class,
new ComponentEventHandler()); new ComponentEventHandler());
dispatcher.register(ComponentInstanceEventType.class,
new ComponentInstanceEventHandler());
dispatcher.setDrainEventsOnStop(); dispatcher.setDrainEventsOnStop();
addIfService(dispatcher); addIfService(dispatcher);
compInstanceDispatcher =
new AsyncDispatcher("CompInstance dispatcher");
compInstanceDispatcher.register(ComponentInstanceEventType.class,
new ComponentInstanceEventHandler());
addIfService(compInstanceDispatcher);
containerLaunchService = new ContainerLaunchService(context.fs); containerLaunchService = new ContainerLaunchService(context.fs);
addService(containerLaunchService); addService(containerLaunchService);
@ -277,10 +273,10 @@ public void serviceStart() throws Exception {
} }
private void recoverComponents(RegisterApplicationMasterResponse response) { private void recoverComponents(RegisterApplicationMasterResponse response) {
List<Container> recoveredContainers = response List<Container> containersFromPrevAttempt = response
.getContainersFromPreviousAttempts(); .getContainersFromPreviousAttempts();
LOG.info("Received {} containers from previous attempt.", LOG.info("Received {} containers from previous attempt.",
recoveredContainers.size()); containersFromPrevAttempt.size());
Map<String, ServiceRecord> existingRecords = new HashMap<>(); Map<String, ServiceRecord> existingRecords = new HashMap<>();
List<String> existingComps = null; List<String> existingComps = null;
try { try {
@ -302,9 +298,8 @@ private void recoverComponents(RegisterApplicationMasterResponse response) {
} }
} }
} }
for (Container container : recoveredContainers) { for (Container container : containersFromPrevAttempt) {
LOG.info("Handling container {} from previous attempt", LOG.info("Handling {} from previous attempt", container.getId());
container.getId());
ServiceRecord record = existingRecords.get(RegistryPathUtils ServiceRecord record = existingRecords.get(RegistryPathUtils
.encodeYarnID(container.getId().toString())); .encodeYarnID(container.getId().toString()));
if (record != null) { if (record != null) {
@ -487,16 +482,21 @@ public void onContainersAllocated(List<Container> containers) {
new ComponentEvent(comp.getName(), CONTAINER_ALLOCATED) new ComponentEvent(comp.getName(), CONTAINER_ALLOCATED)
.setContainer(container); .setContainer(container);
dispatcher.getEventHandler().handle(event); dispatcher.getEventHandler().handle(event);
Collection<AMRMClient.ContainerRequest> requests = amRMClient try {
.getMatchingRequests(container.getAllocationRequestId()); Collection<AMRMClient.ContainerRequest> requests = amRMClient
LOG.info("[COMPONENT {}]: {} outstanding container requests.", .getMatchingRequests(container.getAllocationRequestId());
comp.getName(), requests.size()); LOG.info("[COMPONENT {}]: remove {} outstanding container requests " +
// remove the corresponding request "for allocateId " + container.getAllocationRequestId(),
if (requests.iterator().hasNext()) { comp.getName(), requests.size());
LOG.info("[COMPONENT {}]: removing one container request.", comp // remove the corresponding request
.getName()); if (requests.iterator().hasNext()) {
AMRMClient.ContainerRequest request = requests.iterator().next(); AMRMClient.ContainerRequest request = requests.iterator().next();
amRMClient.removeContainerRequest(request); amRMClient.removeContainerRequest(request);
}
} catch(Exception e) {
//TODO Due to YARN-7490, exception may be thrown, catch and ignore for
//now.
LOG.error("Exception when removing the matching requests. ", e);
} }
} }
} }
@ -569,7 +569,7 @@ private class NMClientCallback extends NMClientAsync.AbstractCallbackHandler {
} }
ComponentEvent event = ComponentEvent event =
new ComponentEvent(instance.getCompName(), CONTAINER_STARTED) new ComponentEvent(instance.getCompName(), CONTAINER_STARTED)
.setInstance(instance); .setInstance(instance).setContainerId(containerId);
dispatcher.getEventHandler().handle(event); dispatcher.getEventHandler().handle(event);
} }
@ -649,10 +649,6 @@ public void removeLiveCompInstance(ContainerId containerId) {
liveInstances.remove(containerId); liveInstances.remove(containerId);
} }
public AsyncDispatcher getCompInstanceDispatcher() {
return compInstanceDispatcher;
}
public YarnRegistryViewForProviders getYarnRegistryOperations() { public YarnRegistryViewForProviders getYarnRegistryOperations() {
return yarnRegistryOperations; return yarnRegistryOperations;
} }

View File

@ -82,7 +82,8 @@ public class Component implements EventHandler<ComponentEvent> {
private Map<String, ComponentInstance> compInstances = private Map<String, ComponentInstance> compInstances =
new ConcurrentHashMap<>(); new ConcurrentHashMap<>();
// component instances to be assigned with a container // component instances to be assigned with a container
private List<ComponentInstance> pendingInstances = new LinkedList<>(); private List<ComponentInstance> pendingInstances =
Collections.synchronizedList(new LinkedList<>());
private ContainerFailureTracker failureTracker; private ContainerFailureTracker failureTracker;
private Probe probe; private Probe probe;
private final ReentrantReadWriteLock.ReadLock readLock; private final ReentrantReadWriteLock.ReadLock readLock;
@ -94,7 +95,7 @@ public class Component implements EventHandler<ComponentEvent> {
private StateMachine<ComponentState, ComponentEventType, ComponentEvent> private StateMachine<ComponentState, ComponentEventType, ComponentEvent>
stateMachine; stateMachine;
private AsyncDispatcher compInstanceDispatcher; private AsyncDispatcher dispatcher;
private static final StateMachineFactory<Component, ComponentState, ComponentEventType, ComponentEvent> private static final StateMachineFactory<Component, ComponentState, ComponentEventType, ComponentEvent>
stateMachineFactory = stateMachineFactory =
new StateMachineFactory<Component, ComponentState, ComponentEventType, ComponentEvent>( new StateMachineFactory<Component, ComponentState, ComponentEventType, ComponentEvent>(
@ -149,7 +150,7 @@ public Component(
this.readLock = lock.readLock(); this.readLock = lock.readLock();
this.writeLock = lock.writeLock(); this.writeLock = lock.writeLock();
this.stateMachine = stateMachineFactory.make(this); this.stateMachine = stateMachineFactory.make(this);
compInstanceDispatcher = scheduler.getCompInstanceDispatcher(); dispatcher = scheduler.getDispatcher();
failureTracker = failureTracker =
new ContainerFailureTracker(context, this); new ContainerFailureTracker(context, this);
probe = MonitorUtils.getProbe(componentSpec.getReadinessCheck()); probe = MonitorUtils.getProbe(componentSpec.getReadinessCheck());
@ -256,30 +257,18 @@ public void transition(Component component, ComponentEvent event) {
component.releaseContainer(container); component.releaseContainer(container);
return; return;
} }
if (instance.hasContainer()) {
LOG.info(
"[COMPONENT {}]: Instance {} already has container, release " +
"surplus container {}",
instance.getCompName(), instance.getCompInstanceId(), container
.getId());
component.releaseContainer(container);
return;
}
component.pendingInstances.remove(instance); component.pendingInstances.remove(instance);
LOG.info("[COMPONENT {}]: Recovered {} for component instance {} on " +
"host {}, num pending component instances reduced to {} ",
component.getName(), container.getId(), instance
.getCompInstanceName(), container.getNodeId(), component
.pendingInstances.size());
instance.setContainer(container); instance.setContainer(container);
ProviderUtils.initCompInstanceDir(component.getContext().fs, instance); ProviderUtils.initCompInstanceDir(component.getContext().fs, instance);
component.getScheduler().addLiveCompInstance(container.getId(), instance); component.getScheduler().addLiveCompInstance(container.getId(), instance);
LOG.info("[COMPONENT {}]: Marking {} as started for component " + LOG.info("[COMPONENT {}]: Recovered {} for component instance {} on " +
"instance {}", component.getName(), event.getContainer().getId(), "host {}, num pending component instances reduced to {} ",
instance.getCompInstanceId()); component.getName(), container.getId(),
component.compInstanceDispatcher.getEventHandler().handle( instance.getCompInstanceName(), container.getNodeId(),
new ComponentInstanceEvent(instance.getContainerId(), component.pendingInstances.size());
START)); component.dispatcher.getEventHandler().handle(
new ComponentInstanceEvent(container.getId(), START));
} }
} }
@ -288,9 +277,8 @@ private static class ContainerStartedTransition implements
@Override public ComponentState transition(Component component, @Override public ComponentState transition(Component component,
ComponentEvent event) { ComponentEvent event) {
component.compInstanceDispatcher.getEventHandler().handle( component.dispatcher.getEventHandler().handle(
new ComponentInstanceEvent(event.getInstance().getContainerId(), new ComponentInstanceEvent(event.getContainerId(), START));
START));
return checkIfStable(component); return checkIfStable(component);
} }
} }
@ -313,14 +301,7 @@ private static class ContainerCompletedTransition extends BaseTransition {
@Override @Override
public void transition(Component component, ComponentEvent event) { public void transition(Component component, ComponentEvent event) {
component.updateMetrics(event.getStatus()); component.updateMetrics(event.getStatus());
component.dispatcher.getEventHandler().handle(
// add back to pending list
component.pendingInstances.add(event.getInstance());
LOG.info(
"[COMPONENT {}]: {} completed, num pending comp instances increased to {}.",
component.getName(), event.getStatus().getContainerId(),
component.pendingInstances.size());
component.compInstanceDispatcher.getEventHandler().handle(
new ComponentInstanceEvent(event.getStatus().getContainerId(), new ComponentInstanceEvent(event.getStatus().getContainerId(),
STOP).setStatus(event.getStatus())); STOP).setStatus(event.getStatus()));
component.componentSpec.setState( component.componentSpec.setState(
@ -328,8 +309,8 @@ public void transition(Component component, ComponentEvent event) {
} }
} }
public ServiceMetrics getCompMetrics () { public void reInsertPendingInstance(ComponentInstance instance) {
return componentMetrics; pendingInstances.add(instance);
} }
private void releaseContainer(Container container) { private void releaseContainer(Container container) {
@ -581,4 +562,9 @@ private static class BaseTransition implements
public ServiceContext getContext() { public ServiceContext getContext() {
return context; return context;
} }
// Only for testing
public List<ComponentInstance> getPendingInstances() {
return pendingInstances;
}
} }

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.service.component; package org.apache.hadoop.yarn.service.component;
import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.event.AbstractEvent; import org.apache.hadoop.yarn.event.AbstractEvent;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
@ -30,6 +31,16 @@ public class ComponentEvent extends AbstractEvent<ComponentEventType> {
private Container container; private Container container;
private ComponentInstance instance; private ComponentInstance instance;
private ContainerStatus status; private ContainerStatus status;
private ContainerId containerId;
public ContainerId getContainerId() {
return containerId;
}
public ComponentEvent setContainerId(ContainerId containerId) {
this.containerId = containerId;
return this;
}
public ComponentEvent(String name, ComponentEventType type) { public ComponentEvent(String name, ComponentEventType type) {
super(type); super(type);

View File

@ -146,7 +146,7 @@ private static class ContainerStartedTransition extends BaseTransition {
compInstance.containerStatusFuture = compInstance.containerStatusFuture =
compInstance.scheduler.executorService.scheduleAtFixedRate( compInstance.scheduler.executorService.scheduleAtFixedRate(
new ContainerStatusRetriever(compInstance.scheduler, new ContainerStatusRetriever(compInstance.scheduler,
compInstance.getContainerId(), compInstance), 0, 1, event.getContainerId(), compInstance), 0, 1,
TimeUnit.SECONDS); TimeUnit.SECONDS);
compInstance.component.incRunningContainers(); compInstance.component.incRunningContainers();
long containerStartTime = System.currentTimeMillis(); long containerStartTime = System.currentTimeMillis();
@ -160,10 +160,10 @@ private static class ContainerStartedTransition extends BaseTransition {
} }
org.apache.hadoop.yarn.service.api.records.Container container = org.apache.hadoop.yarn.service.api.records.Container container =
new org.apache.hadoop.yarn.service.api.records.Container(); new org.apache.hadoop.yarn.service.api.records.Container();
container.setId(compInstance.getContainerId().toString()); container.setId(event.getContainerId().toString());
container.setLaunchTime(new Date(containerStartTime)); container.setLaunchTime(new Date(containerStartTime));
container.setState(ContainerState.RUNNING_BUT_UNREADY); container.setState(ContainerState.RUNNING_BUT_UNREADY);
container.setBareHost(compInstance.container.getNodeId().getHost()); container.setBareHost(compInstance.getNodeId().getHost());
container.setComponentInstanceName(compInstance.getCompInstanceName()); container.setComponentInstanceName(compInstance.getCompInstanceName());
if (compInstance.containerSpec != null) { if (compInstance.containerSpec != null) {
// remove the previous container. // remove the previous container.
@ -219,15 +219,11 @@ public void transition(ComponentInstance compInstance,
// re-ask the failed container. // re-ask the failed container.
Component comp = compInstance.component; Component comp = compInstance.component;
comp.requestContainers(1); comp.requestContainers(1);
LOG.info(compInstance.getCompInstanceId()
+ ": Container completed. Requested a new container." + System
.lineSeparator() + " exitStatus={}, diagnostics={}.",
event.getStatus().getExitStatus(),
event.getStatus().getDiagnostics());
String containerDiag = String containerDiag =
compInstance.getCompInstanceId() + ": " + event.getStatus() compInstance.getCompInstanceId() + ": " + event.getStatus()
.getDiagnostics(); .getDiagnostics();
compInstance.diagnostics.append(containerDiag + System.lineSeparator()); compInstance.diagnostics.append(containerDiag + System.lineSeparator());
compInstance.cancelContainerStatusRetriever();
if (compInstance.getState().equals(READY)) { if (compInstance.getState().equals(READY)) {
compInstance.component.decContainersReady(); compInstance.component.decContainersReady();
@ -255,11 +251,13 @@ public void transition(ComponentInstance compInstance,
// hdfs dir content will be overwritten when a new container gets started, // hdfs dir content will be overwritten when a new container gets started,
// so no need remove. // so no need remove.
compInstance.scheduler.executorService compInstance.scheduler.executorService
.submit(compInstance::cleanupRegistry); .submit(() -> compInstance.cleanupRegistry(event.getContainerId()));
if (compInstance.timelineServiceEnabled) { if (compInstance.timelineServiceEnabled) {
// record in ATS // record in ATS
compInstance.serviceTimelinePublisher.componentInstanceFinished compInstance.serviceTimelinePublisher
(compInstance, event.getStatus().getExitStatus(), containerDiag); .componentInstanceFinished(event.getContainerId(),
event.getStatus().getExitStatus(), containerDiag);
} }
compInstance.containerSpec.setState(ContainerState.STOPPED); compInstance.containerSpec.setState(ContainerState.STOPPED);
} }
@ -267,6 +265,14 @@ public void transition(ComponentInstance compInstance,
// remove the failed ContainerId -> CompInstance mapping // remove the failed ContainerId -> CompInstance mapping
comp.getScheduler().removeLiveCompInstance(event.getContainerId()); comp.getScheduler().removeLiveCompInstance(event.getContainerId());
comp.reInsertPendingInstance(compInstance);
LOG.info(compInstance.getCompInstanceId()
+ ": {} completed. Reinsert back to pending list and requested " +
"a new container." + System.lineSeparator() +
" exitStatus={}, diagnostics={}.",
event.getContainerId(), event.getStatus().getExitStatus(),
event.getStatus().getDiagnostics());
if (shouldExit) { if (shouldExit) {
// Sleep for 5 seconds in hope that the state can be recorded in ATS. // Sleep for 5 seconds in hope that the state can be recorded in ATS.
// in case there's a client polling the comp state, it can be notified. // in case there's a client polling the comp state, it can be notified.
@ -277,8 +283,6 @@ public void transition(ComponentInstance compInstance,
} }
ExitUtil.terminate(-1); ExitUtil.terminate(-1);
} }
compInstance.removeContainer();
} }
} }
@ -312,15 +316,6 @@ public void handle(ComponentInstanceEvent event) {
} }
} }
public boolean hasContainer() {
return this.container != null;
}
public void removeContainer() {
this.container = null;
this.compInstanceId.setContainerId(null);
}
public void setContainer(Container container) { public void setContainer(Container container) {
this.container = container; this.container = container;
this.compInstanceId.setContainerId(container.getId()); this.compInstanceId.setContainerId(container.getId());
@ -337,7 +332,7 @@ public ContainerStatus getContainerStatus() {
public void updateContainerStatus(ContainerStatus status) { public void updateContainerStatus(ContainerStatus status) {
this.status = status; this.status = status;
org.apache.hadoop.yarn.service.api.records.Container container = org.apache.hadoop.yarn.service.api.records.Container container =
getCompSpec().getContainer(getContainerId().toString()); getCompSpec().getContainer(status.getContainerId().toString());
if (container != null) { if (container != null) {
container.setIp(StringUtils.join(",", status.getIPs())); container.setIp(StringUtils.join(",", status.getIPs()));
container.setHostname(status.getHost()); container.setHostname(status.getHost());
@ -348,10 +343,6 @@ public void updateContainerStatus(ContainerStatus status) {
updateServiceRecord(yarnRegistryOperations, status); updateServiceRecord(yarnRegistryOperations, status);
} }
public ContainerId getContainerId() {
return container.getId();
}
public String getCompName() { public String getCompName() {
return compInstanceId.getCompName(); return compInstanceId.getCompName();
} }
@ -423,12 +414,7 @@ private void updateServiceRecord(
public void destroy() { public void destroy() {
LOG.info(getCompInstanceId() + ": Flexed down by user, destroying."); LOG.info(getCompInstanceId() + ": Flexed down by user, destroying.");
diagnostics.append(getCompInstanceId() + ": Flexed down by user"); diagnostics.append(getCompInstanceId() + ": Flexed down by user");
if (container != null) {
scheduler.removeLiveCompInstance(container.getId());
component.getScheduler().getAmRMClient()
.releaseAssignedContainer(container.getId());
getCompSpec().removeContainer(containerSpec);
}
// update metrics // update metrics
if (getState() == STARTED) { if (getState() == STARTED) {
component.decRunningContainers(); component.decRunningContainers();
@ -437,16 +423,29 @@ public void destroy() {
component.decContainersReady(); component.decContainersReady();
component.decRunningContainers(); component.decRunningContainers();
} }
getCompSpec().removeContainer(containerSpec);
if (container == null) {
LOG.info(getCompInstanceId() + " no container is assigned when " +
"destroying");
return;
}
ContainerId containerId = container.getId();
scheduler.removeLiveCompInstance(containerId);
component.getScheduler().getAmRMClient()
.releaseAssignedContainer(containerId);
if (timelineServiceEnabled) { if (timelineServiceEnabled) {
serviceTimelinePublisher.componentInstanceFinished(this, serviceTimelinePublisher.componentInstanceFinished(containerId,
KILLED_BY_APPMASTER, diagnostics.toString()); KILLED_BY_APPMASTER, diagnostics.toString());
} }
scheduler.executorService.submit(this::cleanupRegistryAndCompHdfsDir); cancelContainerStatusRetriever();
scheduler.executorService.submit(() ->
cleanupRegistryAndCompHdfsDir(containerId));
} }
private void cleanupRegistry() { private void cleanupRegistry(ContainerId containerId) {
ContainerId containerId = getContainerId();
String cid = RegistryPathUtils.encodeYarnID(containerId.toString()); String cid = RegistryPathUtils.encodeYarnID(containerId.toString());
try { try {
yarnRegistryOperations.deleteComponent(getCompInstanceId(), cid); yarnRegistryOperations.deleteComponent(getCompInstanceId(), cid);
@ -456,8 +455,8 @@ private void cleanupRegistry() {
} }
//TODO Maybe have a dedicated cleanup service. //TODO Maybe have a dedicated cleanup service.
public void cleanupRegistryAndCompHdfsDir() { public void cleanupRegistryAndCompHdfsDir(ContainerId containerId) {
cleanupRegistry(); cleanupRegistry(containerId);
try { try {
if (compInstanceDir != null && fs.exists(compInstanceDir)) { if (compInstanceDir != null && fs.exists(compInstanceDir)) {
boolean deleted = fs.delete(compInstanceDir, true); boolean deleted = fs.delete(compInstanceDir, true);
@ -515,6 +514,12 @@ private static class ContainerStatusRetriever implements Runnable {
} }
} }
private void cancelContainerStatusRetriever() {
if (containerStatusFuture != null && !containerStatusFuture.isDone()) {
containerStatusFuture.cancel(true);
}
}
@Override @Override
public int compareTo(ComponentInstance to) { public int compareTo(ComponentInstance to) {
long delta = containerStartedTime - to.containerStartedTime; long delta = containerStartedTime - to.containerStartedTime;

View File

@ -87,7 +87,7 @@ public ContainerLauncher(
AbstractLauncher launcher = new AbstractLauncher(fs, null); AbstractLauncher launcher = new AbstractLauncher(fs, null);
try { try {
provider.buildContainerLaunchContext(launcher, service, provider.buildContainerLaunchContext(launcher, service,
instance, fs, getConfig()); instance, fs, getConfig(), container);
instance.getComponent().getScheduler().getNmClient() instance.getComponent().getScheduler().getNmClient()
.startContainerAsync(container, .startContainerAsync(container,
launcher.completeContainerLaunch()); launcher.completeContainerLaunch());

View File

@ -20,6 +20,7 @@
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.ApplicationConstants; import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.service.api.records.Service; import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.conf.YarnServiceConf; import org.apache.hadoop.yarn.service.conf.YarnServiceConf;
import org.apache.hadoop.yarn.service.api.records.Component; import org.apache.hadoop.yarn.service.api.records.Component;
@ -55,7 +56,7 @@ public abstract void processArtifact(AbstractLauncher launcher,
public void buildContainerLaunchContext(AbstractLauncher launcher, public void buildContainerLaunchContext(AbstractLauncher launcher,
Service service, ComponentInstance instance, Service service, ComponentInstance instance,
SliderFileSystem fileSystem, Configuration yarnConf) SliderFileSystem fileSystem, Configuration yarnConf, Container container)
throws IOException, SliderException { throws IOException, SliderException {
Component component = instance.getComponent().getComponentSpec();; Component component = instance.getComponent().getComponentSpec();;
processArtifact(launcher, instance, fileSystem, service); processArtifact(launcher, instance, fileSystem, service);
@ -67,7 +68,7 @@ public void buildContainerLaunchContext(AbstractLauncher launcher,
Map<String, String> globalTokens = Map<String, String> globalTokens =
instance.getComponent().getScheduler().globalTokens; instance.getComponent().getScheduler().globalTokens;
Map<String, String> tokensForSubstitution = ProviderUtils Map<String, String> tokensForSubstitution = ProviderUtils
.initCompTokensForSubstitute(instance); .initCompTokensForSubstitute(instance, container);
tokensForSubstitution.putAll(globalTokens); tokensForSubstitution.putAll(globalTokens);
// Set the environment variables in launcher // Set the environment variables in launcher
launcher.putEnv(ServiceUtils launcher.putEnv(ServiceUtils

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.service.provider; package org.apache.hadoop.yarn.service.provider;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.service.api.records.Service; import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.utils.SliderFileSystem; import org.apache.hadoop.yarn.service.utils.SliderFileSystem;
import org.apache.hadoop.yarn.service.exceptions.SliderException; import org.apache.hadoop.yarn.service.exceptions.SliderException;
@ -34,6 +35,6 @@ public interface ProviderService {
*/ */
void buildContainerLaunchContext(AbstractLauncher containerLauncher, void buildContainerLaunchContext(AbstractLauncher containerLauncher,
Service service, ComponentInstance instance, Service service, ComponentInstance instance,
SliderFileSystem sliderFileSystem, Configuration yarnConf) SliderFileSystem sliderFileSystem, Configuration yarnConf, Container
throws IOException, SliderException; container) throws IOException, SliderException;
} }

View File

@ -24,6 +24,7 @@
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType; import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.service.ServiceContext; import org.apache.hadoop.yarn.service.ServiceContext;
@ -393,13 +394,13 @@ private static void resolvePlainTemplateAndSaveOnHdfs(FileSystem fs,
* @return tokens to replace * @return tokens to replace
*/ */
public static Map<String, String> initCompTokensForSubstitute( public static Map<String, String> initCompTokensForSubstitute(
ComponentInstance instance) { ComponentInstance instance, Container container) {
Map<String, String> tokens = new HashMap<>(); Map<String, String> tokens = new HashMap<>();
tokens.put(COMPONENT_NAME, instance.getCompSpec().getName()); tokens.put(COMPONENT_NAME, instance.getCompSpec().getName());
tokens tokens
.put(COMPONENT_NAME_LC, instance.getCompSpec().getName().toLowerCase()); .put(COMPONENT_NAME_LC, instance.getCompSpec().getName().toLowerCase());
tokens.put(COMPONENT_INSTANCE_NAME, instance.getCompInstanceName()); tokens.put(COMPONENT_INSTANCE_NAME, instance.getCompInstanceName());
tokens.put(CONTAINER_ID, instance.getContainer().getId().toString()); tokens.put(CONTAINER_ID, container.getId().toString());
tokens.put(COMPONENT_ID, tokens.put(COMPONENT_ID,
String.valueOf(instance.getCompInstanceId().getId())); String.valueOf(instance.getCompInstanceId().getId()));
tokens.putAll(instance.getComponent().getDependencyHostIpTokens()); tokens.putAll(instance.getComponent().getDependencyHostIpTokens());

View File

@ -20,6 +20,7 @@
import org.apache.hadoop.metrics2.AbstractMetric; import org.apache.hadoop.metrics2.AbstractMetric;
import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity; import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity;
import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEvent; import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEvent;
@ -178,10 +179,10 @@ public void componentInstanceStarted(Container container,
putEntity(entity); putEntity(entity);
} }
public void componentInstanceFinished(ComponentInstance instance, public void componentInstanceFinished(ContainerId containerId,
int exitCode, String diagnostics) { int exitCode, String diagnostics) {
TimelineEntity entity = createComponentInstanceEntity( TimelineEntity entity = createComponentInstanceEntity(
instance.getContainer().getId().toString()); containerId.toString());
// create info keys // create info keys
Map<String, Object> entityInfos = new HashMap<String, Object>(); Map<String, Object> entityInfos = new HashMap<String, Object>();

View File

@ -24,14 +24,8 @@
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse; import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient; import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.NMClient; import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
@ -42,15 +36,15 @@
import org.apache.hadoop.yarn.service.api.records.Service; import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.component.Component; import org.apache.hadoop.yarn.service.component.Component;
import org.apache.hadoop.yarn.service.component.ComponentState; import org.apache.hadoop.yarn.service.component.ComponentState;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState;
import org.apache.hadoop.yarn.service.exceptions.BadClusterStateException; import org.apache.hadoop.yarn.service.exceptions.BadClusterStateException;
import org.apache.hadoop.yarn.service.registry.YarnRegistryViewForProviders; import org.apache.hadoop.yarn.service.registry.YarnRegistryViewForProviders;
import org.apache.hadoop.yarn.service.utils.SliderFileSystem; import org.apache.hadoop.yarn.service.utils.SliderFileSystem;
import org.apache.hadoop.yarn.util.Records;
import java.io.IOException; import java.io.IOException;
import java.util.Collections; import java.util.*;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import static org.mockito.Mockito.mock; import static org.mockito.Mockito.mock;
@ -63,6 +57,8 @@ public class MockServiceAM extends ServiceMaster {
final List<Container> feedContainers = final List<Container> feedContainers =
Collections.synchronizedList(new LinkedList<>()); Collections.synchronizedList(new LinkedList<>());
final List<ContainerStatus> failedContainers =
Collections.synchronizedList(new LinkedList<>());
public MockServiceAM(Service service) { public MockServiceAM(Service service) {
super(service.getName()); super(service.getName());
this.service = service; this.service = service;
@ -102,10 +98,10 @@ protected AMRMClientAsync<AMRMClient.ContainerRequest> createAMRMClient() {
AllocateResponse.AllocateResponseBuilder builder = AllocateResponse.AllocateResponseBuilder builder =
AllocateResponse.newBuilder(); AllocateResponse.newBuilder();
// add new containers if any
synchronized (feedContainers) { synchronized (feedContainers) {
if (feedContainers.isEmpty()) { if (feedContainers.isEmpty()) {
System.out.println("Allocating........ no containers"); System.out.println("Allocating........ no containers");
return builder.build();
} else { } else {
// The AMRMClient will return containers for compoenent that are // The AMRMClient will return containers for compoenent that are
// at FLEXING state // at FLEXING state
@ -121,9 +117,20 @@ protected AMRMClientAsync<AMRMClient.ContainerRequest> createAMRMClient() {
itor.remove(); itor.remove();
} }
} }
return builder.allocatedContainers(allocatedContainers).build(); builder.allocatedContainers(allocatedContainers);
} }
} }
// add failed containers if any
synchronized (failedContainers) {
if (!failedContainers.isEmpty()) {
List<ContainerStatus> failed =
new LinkedList<>(failedContainers);
failedContainers.clear();
builder.completedContainersStatuses(failed);
}
}
return builder.build();
} }
@Override @Override
@ -184,6 +191,19 @@ public Container feedContainerToComp(Service service, int id,
return container; return container;
} }
public void feedFailedContainerToComp(Service service, int id, String
compName) {
ApplicationId applicationId = ApplicationId.fromString(service.getId());
ContainerId containerId = ContainerId
.newContainerId(ApplicationAttemptId.newInstance(applicationId, 1), id);
ContainerStatus containerStatus = Records.newRecord(ContainerStatus.class);
containerStatus.setContainerId(containerId);
synchronized (failedContainers) {
failedContainers.add(containerStatus);
}
}
public void flexComponent(String compName, long numberOfContainers) public void flexComponent(String compName, long numberOfContainers)
throws IOException { throws IOException {
ClientAMProtocol.ComponentCountProto componentCountProto = ClientAMProtocol.ComponentCountProto componentCountProto =
@ -218,4 +238,22 @@ public void waitForNumDesiredContainers(String compName,
} }
}, 1000, 20000); }, 1000, 20000);
} }
public ComponentInstance getCompInstance(String compName, String
instanceName) {
return context.scheduler.getAllComponents().get(compName)
.getComponentInstance(instanceName);
}
public void waitForCompInstanceState(ComponentInstance instance,
ComponentInstanceState state)
throws TimeoutException, InterruptedException {
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
return instance.getState().equals(state);
}
}, 1000, 20000);
}
} }

View File

@ -65,6 +65,7 @@ public class ServiceTestUtils {
private MiniYARNCluster yarnCluster = null; private MiniYARNCluster yarnCluster = null;
private MiniDFSCluster hdfsCluster = null; private MiniDFSCluster hdfsCluster = null;
TestingCluster zkCluster;
private FileSystem fs = null; private FileSystem fs = null;
private Configuration conf = null; private Configuration conf = null;
public static final int NUM_NMS = 1; public static final int NUM_NMS = 1;
@ -165,7 +166,6 @@ protected void setupInternal(int numNodeManager)
conf.setBoolean(NM_VMEM_CHECK_ENABLED, false); conf.setBoolean(NM_VMEM_CHECK_ENABLED, false);
conf.setBoolean(NM_PMEM_CHECK_ENABLED, false); conf.setBoolean(NM_PMEM_CHECK_ENABLED, false);
// setup zk cluster // setup zk cluster
TestingCluster zkCluster;
zkCluster = new TestingCluster(1); zkCluster = new TestingCluster(1);
zkCluster.start(); zkCluster.start();
conf.set(YarnConfiguration.RM_ZK_ADDRESS, zkCluster.getConnectString()); conf.set(YarnConfiguration.RM_ZK_ADDRESS, zkCluster.getConnectString());
@ -239,6 +239,9 @@ public void shutdown() throws IOException {
hdfsCluster = null; hdfsCluster = null;
} }
} }
if (zkCluster != null) {
zkCluster.stop();
}
if (basedir != null) { if (basedir != null) {
FileUtils.deleteDirectory(basedir); FileUtils.deleteDirectory(basedir);
} }

View File

@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.service;
import org.apache.commons.io.FileUtils;
import org.apache.curator.test.TestingCluster;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeoutException;
import static org.apache.hadoop.registry.client.api.RegistryConstants
.KEY_REGISTRY_ZK_QUORUM;
public class TestServiceAM extends ServiceTestUtils{
private File basedir;
YarnConfiguration conf = new YarnConfiguration();
TestingCluster zkCluster;
@Before
public void setup() throws Exception {
basedir = new File("target", "apps");
if (basedir.exists()) {
FileUtils.deleteDirectory(basedir);
} else {
basedir.mkdirs();
}
zkCluster = new TestingCluster(1);
zkCluster.start();
conf.set(KEY_REGISTRY_ZK_QUORUM, zkCluster.getConnectString());
System.out.println("ZK cluster: " + zkCluster.getConnectString());
}
@After
public void tearDown() throws IOException {
if (basedir != null) {
FileUtils.deleteDirectory(basedir);
}
if (zkCluster != null) {
zkCluster.stop();
}
}
// Race condition YARN-7486
// 1. Allocate 1 container to compa and wait it to be started
// 2. Fail this container, and in the meanwhile allocate the 2nd container.
// 3. The 2nd container should not be assigned to compa-0 instance, because
// the compa-0 instance is not stopped yet.
// 4. check compa still has the instance in the pending list.
@Test
public void testContainerCompleted() throws TimeoutException,
InterruptedException {
ApplicationId applicationId = ApplicationId.newInstance(123456, 1);
Service exampleApp = new Service();
exampleApp.setId(applicationId.toString());
exampleApp.setName("testContainerCompleted");
exampleApp.addComponent(createComponent("compa", 1, "pwd"));
MockServiceAM am = new MockServiceAM(exampleApp);
am.init(conf);
am.start();
ComponentInstance compa0 = am.getCompInstance("compa", "compa-0");
// allocate a container
am.feedContainerToComp(exampleApp, 1, "compa");
am.waitForCompInstanceState(compa0, ComponentInstanceState.STARTED);
System.out.println("Fail the container 1");
// fail the container
am.feedFailedContainerToComp(exampleApp, 1, "compa");
// allocate the second container immediately, this container will not be
// assigned to comp instance
// because the instance is not yet added to the pending list.
am.feedContainerToComp(exampleApp, 2, "compa");
am.waitForCompInstanceState(compa0, ComponentInstanceState.INIT);
// still 1 pending instance
Assert.assertEquals(1,
am.getComponent("compa").getPendingInstances().size());
am.stop();
}
}

View File

@ -20,6 +20,7 @@
package org.apache.hadoop.yarn.service.monitor; package org.apache.hadoop.yarn.service.monitor;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.curator.test.TestingCluster;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.service.MockServiceAM; import org.apache.hadoop.yarn.service.MockServiceAM;
@ -37,10 +38,14 @@
import java.io.IOException; import java.io.IOException;
import java.util.Collections; import java.util.Collections;
import static org.apache.hadoop.registry.client.api.RegistryConstants
.KEY_REGISTRY_ZK_QUORUM;
public class TestServiceMonitor extends ServiceTestUtils { public class TestServiceMonitor extends ServiceTestUtils {
private File basedir; private File basedir;
YarnConfiguration conf = new YarnConfiguration(); YarnConfiguration conf = new YarnConfiguration();
TestingCluster zkCluster;
@Before @Before
public void setup() throws Exception { public void setup() throws Exception {
@ -51,6 +56,10 @@ public void setup() throws Exception {
basedir.mkdirs(); basedir.mkdirs();
} }
conf.setLong(YarnServiceConf.READINESS_CHECK_INTERVAL, 2); conf.setLong(YarnServiceConf.READINESS_CHECK_INTERVAL, 2);
zkCluster = new TestingCluster(1);
zkCluster.start();
conf.set(KEY_REGISTRY_ZK_QUORUM, zkCluster.getConnectString());
System.out.println("ZK cluster: " + zkCluster.getConnectString());
} }
@After @After
@ -58,6 +67,9 @@ public void tearDown() throws IOException {
if (basedir != null) { if (basedir != null) {
FileUtils.deleteDirectory(basedir); FileUtils.deleteDirectory(basedir);
} }
if (zkCluster != null) {
zkCluster.stop();
}
} }
// Create compa with 1 container // Create compa with 1 container