YARN-7486. Race condition in service AM that can cause NPE. Contributed by Jian He
This commit is contained in:
parent 462e25a3b2
commit f4d5d20286
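For context on the title: the NPE comes from code that resolves a container id through the component instance's mutable container field (the getContainerId()/removeContainer() pair removed below) while a concurrent container-completed transition clears that field. The commit instead carries the ContainerId on the events themselves and routes component and instance events through a single dispatcher. Below is a minimal, illustrative Java sketch of the unsafe pattern; the class names are stand-ins invented for this example, not the real org.apache.hadoop.yarn.service classes.

    // Illustrative only: hand-rolled stand-ins for the race described above,
    // not the actual Hadoop YARN service classes.
    final class ContainerIdStub {
    }

    final class ContainerStub {
      private final ContainerIdStub id = new ContainerIdStub();

      ContainerIdStub getId() {
        return id;
      }
    }

    final class ComponentInstanceSketch {
      // Shared between the dispatcher thread and the completed-container path.
      private volatile ContainerStub container = new ContainerStub();

      // Old pattern: resolve the id through the instance's container field.
      ContainerIdStub getContainerId() {
        return container.getId(); // NPE once the field has been cleared
      }

      // Old completed-container handling cleared the field (removeContainer()).
      void onContainerCompleted() {
        container = null;
      }

      public static void main(String[] args) throws InterruptedException {
        ComponentInstanceSketch instance = new ComponentInstanceSketch();
        Thread completer = new Thread(instance::onContainerCompleted);
        completer.start();
        completer.join();
        // A handler that still asks the instance for its container id now
        // dereferences null; carrying the ContainerId on the event avoids this.
        instance.getContainerId();
      }
    }

The reconstructed diff follows.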
@@ -132,7 +132,6 @@ public class ServiceScheduler extends CompositeService {
   private AMRMClientAsync<AMRMClient.ContainerRequest> amRMClient;
   private NMClientAsync nmClient;
   private AsyncDispatcher dispatcher;
-  AsyncDispatcher compInstanceDispatcher;
   private YarnRegistryViewForProviders yarnRegistryOperations;
   private ServiceContext context;
   private ContainerLaunchService containerLaunchService;
@@ -152,7 +151,7 @@ public void buildInstance(ServiceContext context, Configuration configuration)
     yarnRegistryOperations =
         createYarnRegistryOperations(context, registryClient);

-    // register metrics
+    // register metrics,
     serviceMetrics = ServiceMetrics
         .register(app.getName(), "Metrics for service");
     serviceMetrics.tag("type", "Metrics type [component or service]", "service");
@@ -167,14 +166,11 @@ public void buildInstance(ServiceContext context, Configuration configuration)
     dispatcher = new AsyncDispatcher("Component dispatcher");
     dispatcher.register(ComponentEventType.class,
         new ComponentEventHandler());
+    dispatcher.register(ComponentInstanceEventType.class,
+        new ComponentInstanceEventHandler());
     dispatcher.setDrainEventsOnStop();
     addIfService(dispatcher);

-    compInstanceDispatcher =
-        new AsyncDispatcher("CompInstance dispatcher");
-    compInstanceDispatcher.register(ComponentInstanceEventType.class,
-        new ComponentInstanceEventHandler());
-    addIfService(compInstanceDispatcher);
     containerLaunchService = new ContainerLaunchService(context.fs);
     addService(containerLaunchService);

@@ -277,10 +273,10 @@ public void serviceStart() throws Exception {
   }

   private void recoverComponents(RegisterApplicationMasterResponse response) {
-    List<Container> recoveredContainers = response
+    List<Container> containersFromPrevAttempt = response
         .getContainersFromPreviousAttempts();
     LOG.info("Received {} containers from previous attempt.",
-        recoveredContainers.size());
+        containersFromPrevAttempt.size());
     Map<String, ServiceRecord> existingRecords = new HashMap<>();
     List<String> existingComps = null;
     try {
@@ -302,9 +298,8 @@ private void recoverComponents(RegisterApplicationMasterResponse response) {
        }
      }
    }
-    for (Container container : recoveredContainers) {
-      LOG.info("Handling container {} from previous attempt",
-          container.getId());
+    for (Container container : containersFromPrevAttempt) {
+      LOG.info("Handling {} from previous attempt", container.getId());
      ServiceRecord record = existingRecords.get(RegistryPathUtils
          .encodeYarnID(container.getId().toString()));
      if (record != null) {
@@ -487,16 +482,21 @@ public void onContainersAllocated(List<Container> containers) {
            new ComponentEvent(comp.getName(), CONTAINER_ALLOCATED)
                .setContainer(container);
        dispatcher.getEventHandler().handle(event);
-        Collection<AMRMClient.ContainerRequest> requests = amRMClient
-            .getMatchingRequests(container.getAllocationRequestId());
-        LOG.info("[COMPONENT {}]: {} outstanding container requests.",
-            comp.getName(), requests.size());
-        // remove the corresponding request
-        if (requests.iterator().hasNext()) {
-          LOG.info("[COMPONENT {}]: removing one container request.", comp
-              .getName());
+        try {
+          Collection<AMRMClient.ContainerRequest> requests = amRMClient
+              .getMatchingRequests(container.getAllocationRequestId());
+          LOG.info("[COMPONENT {}]: remove {} outstanding container requests " +
+                  "for allocateId " + container.getAllocationRequestId(),
+              comp.getName(), requests.size());
+          // remove the corresponding request
+          if (requests.iterator().hasNext()) {
            AMRMClient.ContainerRequest request = requests.iterator().next();
            amRMClient.removeContainerRequest(request);
+          }
+        } catch(Exception e) {
+          //TODO Due to YARN-7490, exception may be thrown, catch and ignore for
+          //now.
+          LOG.error("Exception when removing the matching requests. ", e);
        }
      }
    }
@@ -569,7 +569,7 @@ private class NMClientCallback extends NMClientAsync.AbstractCallbackHandler {
      }
      ComponentEvent event =
          new ComponentEvent(instance.getCompName(), CONTAINER_STARTED)
-              .setInstance(instance);
+              .setInstance(instance).setContainerId(containerId);
      dispatcher.getEventHandler().handle(event);
    }

@@ -649,10 +649,6 @@ public void removeLiveCompInstance(ContainerId containerId) {
     liveInstances.remove(containerId);
   }

-  public AsyncDispatcher getCompInstanceDispatcher() {
-    return compInstanceDispatcher;
-  }
-
   public YarnRegistryViewForProviders getYarnRegistryOperations() {
     return yarnRegistryOperations;
   }
@@ -82,7 +82,8 @@ public class Component implements EventHandler<ComponentEvent> {
   private Map<String, ComponentInstance> compInstances =
       new ConcurrentHashMap<>();
   // component instances to be assigned with a container
-  private List<ComponentInstance> pendingInstances = new LinkedList<>();
+  private List<ComponentInstance> pendingInstances =
+      Collections.synchronizedList(new LinkedList<>());
   private ContainerFailureTracker failureTracker;
   private Probe probe;
   private final ReentrantReadWriteLock.ReadLock readLock;
@@ -94,7 +95,7 @@ public class Component implements EventHandler<ComponentEvent> {

   private StateMachine<ComponentState, ComponentEventType, ComponentEvent>
       stateMachine;
-  private AsyncDispatcher compInstanceDispatcher;
+  private AsyncDispatcher dispatcher;
   private static final StateMachineFactory<Component, ComponentState, ComponentEventType, ComponentEvent>
       stateMachineFactory =
       new StateMachineFactory<Component, ComponentState, ComponentEventType, ComponentEvent>(
@@ -149,7 +150,7 @@ public Component(
     this.readLock = lock.readLock();
     this.writeLock = lock.writeLock();
     this.stateMachine = stateMachineFactory.make(this);
-    compInstanceDispatcher = scheduler.getCompInstanceDispatcher();
+    dispatcher = scheduler.getDispatcher();
     failureTracker =
         new ContainerFailureTracker(context, this);
     probe = MonitorUtils.getProbe(componentSpec.getReadinessCheck());
@@ -256,30 +257,18 @@ public void transition(Component component, ComponentEvent event) {
        component.releaseContainer(container);
        return;
      }
-      if (instance.hasContainer()) {
-        LOG.info(
-            "[COMPONENT {}]: Instance {} already has container, release " +
-                "surplus container {}",
-            instance.getCompName(), instance.getCompInstanceId(), container
-                .getId());
-        component.releaseContainer(container);
-        return;
-      }
      component.pendingInstances.remove(instance);
-      LOG.info("[COMPONENT {}]: Recovered {} for component instance {} on " +
-              "host {}, num pending component instances reduced to {} ",
-          component.getName(), container.getId(), instance
-              .getCompInstanceName(), container.getNodeId(), component
-              .pendingInstances.size());
      instance.setContainer(container);
      ProviderUtils.initCompInstanceDir(component.getContext().fs, instance);
      component.getScheduler().addLiveCompInstance(container.getId(), instance);
-      LOG.info("[COMPONENT {}]: Marking {} as started for component " +
-          "instance {}", component.getName(), event.getContainer().getId(),
-          instance.getCompInstanceId());
-      component.compInstanceDispatcher.getEventHandler().handle(
-          new ComponentInstanceEvent(instance.getContainerId(),
-              START));
+      LOG.info("[COMPONENT {}]: Recovered {} for component instance {} on " +
+              "host {}, num pending component instances reduced to {} ",
+          component.getName(), container.getId(),
+          instance.getCompInstanceName(), container.getNodeId(),
+          component.pendingInstances.size());
+      component.dispatcher.getEventHandler().handle(
+          new ComponentInstanceEvent(container.getId(), START));
    }
  }

@@ -288,9 +277,8 @@ private static class ContainerStartedTransition implements

    @Override public ComponentState transition(Component component,
        ComponentEvent event) {
-      component.compInstanceDispatcher.getEventHandler().handle(
-          new ComponentInstanceEvent(event.getInstance().getContainerId(),
-              START));
+      component.dispatcher.getEventHandler().handle(
+          new ComponentInstanceEvent(event.getContainerId(), START));
      return checkIfStable(component);
    }
  }
@@ -313,14 +301,7 @@ private static class ContainerCompletedTransition extends BaseTransition {
    @Override
    public void transition(Component component, ComponentEvent event) {
      component.updateMetrics(event.getStatus());
-      // add back to pending list
-      component.pendingInstances.add(event.getInstance());
-      LOG.info(
-          "[COMPONENT {}]: {} completed, num pending comp instances increased to {}.",
-          component.getName(), event.getStatus().getContainerId(),
-          component.pendingInstances.size());
-      component.compInstanceDispatcher.getEventHandler().handle(
+      component.dispatcher.getEventHandler().handle(
          new ComponentInstanceEvent(event.getStatus().getContainerId(),
              STOP).setStatus(event.getStatus()));
      component.componentSpec.setState(
@@ -328,8 +309,8 @@ public void transition(Component component, ComponentEvent event) {
    }
  }

-  public ServiceMetrics getCompMetrics () {
-    return componentMetrics;
+  public void reInsertPendingInstance(ComponentInstance instance) {
+    pendingInstances.add(instance);
  }

  private void releaseContainer(Container container) {
@@ -581,4 +562,9 @@ private static class BaseTransition implements
  public ServiceContext getContext() {
    return context;
  }
+
+  // Only for testing
+  public List<ComponentInstance> getPendingInstances() {
+    return pendingInstances;
+  }
 }
@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.service.component;

 import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.event.AbstractEvent;
 import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
@@ -30,6 +31,16 @@ public class ComponentEvent extends AbstractEvent<ComponentEventType> {
   private Container container;
   private ComponentInstance instance;
   private ContainerStatus status;
+  private ContainerId containerId;
+
+  public ContainerId getContainerId() {
+    return containerId;
+  }
+
+  public ComponentEvent setContainerId(ContainerId containerId) {
+    this.containerId = containerId;
+    return this;
+  }

   public ComponentEvent(String name, ComponentEventType type) {
     super(type);
@@ -146,7 +146,7 @@ private static class ContainerStartedTransition extends BaseTransition {
      compInstance.containerStatusFuture =
          compInstance.scheduler.executorService.scheduleAtFixedRate(
              new ContainerStatusRetriever(compInstance.scheduler,
-                  compInstance.getContainerId(), compInstance), 0, 1,
+                  event.getContainerId(), compInstance), 0, 1,
              TimeUnit.SECONDS);
      compInstance.component.incRunningContainers();
      long containerStartTime = System.currentTimeMillis();
@@ -160,10 +160,10 @@ private static class ContainerStartedTransition extends BaseTransition {
      }
      org.apache.hadoop.yarn.service.api.records.Container container =
          new org.apache.hadoop.yarn.service.api.records.Container();
-      container.setId(compInstance.getContainerId().toString());
+      container.setId(event.getContainerId().toString());
      container.setLaunchTime(new Date(containerStartTime));
      container.setState(ContainerState.RUNNING_BUT_UNREADY);
-      container.setBareHost(compInstance.container.getNodeId().getHost());
+      container.setBareHost(compInstance.getNodeId().getHost());
      container.setComponentInstanceName(compInstance.getCompInstanceName());
      if (compInstance.containerSpec != null) {
        // remove the previous container.
@@ -219,15 +219,11 @@ public void transition(ComponentInstance compInstance,
      // re-ask the failed container.
      Component comp = compInstance.component;
      comp.requestContainers(1);
-      LOG.info(compInstance.getCompInstanceId()
-          + ": Container completed. Requested a new container." + System
-          .lineSeparator() + " exitStatus={}, diagnostics={}.",
-          event.getStatus().getExitStatus(),
-          event.getStatus().getDiagnostics());
      String containerDiag =
          compInstance.getCompInstanceId() + ": " + event.getStatus()
              .getDiagnostics();
      compInstance.diagnostics.append(containerDiag + System.lineSeparator());
+      compInstance.cancelContainerStatusRetriever();

      if (compInstance.getState().equals(READY)) {
        compInstance.component.decContainersReady();
@@ -255,11 +251,13 @@ public void transition(ComponentInstance compInstance,
      // hdfs dir content will be overwritten when a new container gets started,
      // so no need remove.
      compInstance.scheduler.executorService
-          .submit(compInstance::cleanupRegistry);
+          .submit(() -> compInstance.cleanupRegistry(event.getContainerId()));
+
      if (compInstance.timelineServiceEnabled) {
        // record in ATS
-        compInstance.serviceTimelinePublisher.componentInstanceFinished
-            (compInstance, event.getStatus().getExitStatus(), containerDiag);
+        compInstance.serviceTimelinePublisher
+            .componentInstanceFinished(event.getContainerId(),
+                event.getStatus().getExitStatus(), containerDiag);
      }
      compInstance.containerSpec.setState(ContainerState.STOPPED);
    }
@@ -267,6 +265,14 @@ public void transition(ComponentInstance compInstance,
      // remove the failed ContainerId -> CompInstance mapping
      comp.getScheduler().removeLiveCompInstance(event.getContainerId());

+      comp.reInsertPendingInstance(compInstance);
+
+      LOG.info(compInstance.getCompInstanceId()
+              + ": {} completed. Reinsert back to pending list and requested " +
+              "a new container." + System.lineSeparator() +
+              " exitStatus={}, diagnostics={}.",
+          event.getContainerId(), event.getStatus().getExitStatus(),
+          event.getStatus().getDiagnostics());
      if (shouldExit) {
        // Sleep for 5 seconds in hope that the state can be recorded in ATS.
        // in case there's a client polling the comp state, it can be notified.
@@ -277,8 +283,6 @@ public void transition(ComponentInstance compInstance,
        }
        ExitUtil.terminate(-1);
      }
-
-      compInstance.removeContainer();
    }
  }

@@ -312,15 +316,6 @@ public void handle(ComponentInstanceEvent event) {
    }
  }

-  public boolean hasContainer() {
-    return this.container != null;
-  }
-
-  public void removeContainer() {
-    this.container = null;
-    this.compInstanceId.setContainerId(null);
-  }
-
  public void setContainer(Container container) {
    this.container = container;
    this.compInstanceId.setContainerId(container.getId());
@@ -337,7 +332,7 @@ public ContainerStatus getContainerStatus() {
  public void updateContainerStatus(ContainerStatus status) {
    this.status = status;
    org.apache.hadoop.yarn.service.api.records.Container container =
-        getCompSpec().getContainer(getContainerId().toString());
+        getCompSpec().getContainer(status.getContainerId().toString());
    if (container != null) {
      container.setIp(StringUtils.join(",", status.getIPs()));
      container.setHostname(status.getHost());
@@ -348,10 +343,6 @@ public void updateContainerStatus(ContainerStatus status) {
      updateServiceRecord(yarnRegistryOperations, status);
    }
  }

-  public ContainerId getContainerId() {
-    return container.getId();
-  }
-
  public String getCompName() {
    return compInstanceId.getCompName();
  }
@@ -423,12 +414,7 @@ private void updateServiceRecord(
  public void destroy() {
    LOG.info(getCompInstanceId() + ": Flexed down by user, destroying.");
    diagnostics.append(getCompInstanceId() + ": Flexed down by user");
-    if (container != null) {
-      scheduler.removeLiveCompInstance(container.getId());
-      component.getScheduler().getAmRMClient()
-          .releaseAssignedContainer(container.getId());
-      getCompSpec().removeContainer(containerSpec);
-    }
    // update metrics
    if (getState() == STARTED) {
      component.decRunningContainers();
@@ -437,16 +423,29 @@ public void destroy() {
      component.decContainersReady();
      component.decRunningContainers();
    }
+    getCompSpec().removeContainer(containerSpec);
+
+    if (container == null) {
+      LOG.info(getCompInstanceId() + " no container is assigned when " +
+          "destroying");
+      return;
+    }
+
+    ContainerId containerId = container.getId();
+    scheduler.removeLiveCompInstance(containerId);
+    component.getScheduler().getAmRMClient()
+        .releaseAssignedContainer(containerId);
+
    if (timelineServiceEnabled) {
-      serviceTimelinePublisher.componentInstanceFinished(this,
+      serviceTimelinePublisher.componentInstanceFinished(containerId,
          KILLED_BY_APPMASTER, diagnostics.toString());
    }
-    scheduler.executorService.submit(this::cleanupRegistryAndCompHdfsDir);
+    cancelContainerStatusRetriever();
+    scheduler.executorService.submit(() ->
+        cleanupRegistryAndCompHdfsDir(containerId));
  }

-  private void cleanupRegistry() {
-    ContainerId containerId = getContainerId();
+  private void cleanupRegistry(ContainerId containerId) {
    String cid = RegistryPathUtils.encodeYarnID(containerId.toString());
    try {
      yarnRegistryOperations.deleteComponent(getCompInstanceId(), cid);
@@ -456,8 +455,8 @@ private void cleanupRegistry() {
  }

  //TODO Maybe have a dedicated cleanup service.
-  public void cleanupRegistryAndCompHdfsDir() {
-    cleanupRegistry();
+  public void cleanupRegistryAndCompHdfsDir(ContainerId containerId) {
+    cleanupRegistry(containerId);
    try {
      if (compInstanceDir != null && fs.exists(compInstanceDir)) {
        boolean deleted = fs.delete(compInstanceDir, true);
@@ -515,6 +514,12 @@ private static class ContainerStatusRetriever implements Runnable {
    }
  }

+  private void cancelContainerStatusRetriever() {
+    if (containerStatusFuture != null && !containerStatusFuture.isDone()) {
+      containerStatusFuture.cancel(true);
+    }
+  }
+
  @Override
  public int compareTo(ComponentInstance to) {
    long delta = containerStartedTime - to.containerStartedTime;
@@ -87,7 +87,7 @@ public ContainerLauncher(
      AbstractLauncher launcher = new AbstractLauncher(fs, null);
      try {
        provider.buildContainerLaunchContext(launcher, service,
-            instance, fs, getConfig());
+            instance, fs, getConfig(), container);
        instance.getComponent().getScheduler().getNmClient()
            .startContainerAsync(container,
                launcher.completeContainerLaunch());
@@ -20,6 +20,7 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.ApplicationConstants;
+import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.service.api.records.Service;
 import org.apache.hadoop.yarn.service.conf.YarnServiceConf;
 import org.apache.hadoop.yarn.service.api.records.Component;
@@ -55,7 +56,7 @@ public abstract void processArtifact(AbstractLauncher launcher,

  public void buildContainerLaunchContext(AbstractLauncher launcher,
      Service service, ComponentInstance instance,
-      SliderFileSystem fileSystem, Configuration yarnConf)
+      SliderFileSystem fileSystem, Configuration yarnConf, Container container)
      throws IOException, SliderException {
    Component component = instance.getComponent().getComponentSpec();;
    processArtifact(launcher, instance, fileSystem, service);
@@ -67,7 +68,7 @@ public void buildContainerLaunchContext(AbstractLauncher launcher,
    Map<String, String> globalTokens =
        instance.getComponent().getScheduler().globalTokens;
    Map<String, String> tokensForSubstitution = ProviderUtils
-        .initCompTokensForSubstitute(instance);
+        .initCompTokensForSubstitute(instance, container);
    tokensForSubstitution.putAll(globalTokens);
    // Set the environment variables in launcher
    launcher.putEnv(ServiceUtils
@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.service.provider;

 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.service.api.records.Service;
 import org.apache.hadoop.yarn.service.utils.SliderFileSystem;
 import org.apache.hadoop.yarn.service.exceptions.SliderException;
@@ -34,6 +35,6 @@ public interface ProviderService {
   */
  void buildContainerLaunchContext(AbstractLauncher containerLauncher,
      Service service, ComponentInstance instance,
-      SliderFileSystem sliderFileSystem, Configuration yarnConf)
-      throws IOException, SliderException;
+      SliderFileSystem sliderFileSystem, Configuration yarnConf, Container
+      container) throws IOException, SliderException;
 }
@@ -24,6 +24,7 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.LocalResource;
 import org.apache.hadoop.yarn.api.records.LocalResourceType;
 import org.apache.hadoop.yarn.service.ServiceContext;
@@ -393,13 +394,13 @@ private static void resolvePlainTemplateAndSaveOnHdfs(FileSystem fs,
   * @return tokens to replace
   */
  public static Map<String, String> initCompTokensForSubstitute(
-      ComponentInstance instance) {
+      ComponentInstance instance, Container container) {
    Map<String, String> tokens = new HashMap<>();
    tokens.put(COMPONENT_NAME, instance.getCompSpec().getName());
    tokens
        .put(COMPONENT_NAME_LC, instance.getCompSpec().getName().toLowerCase());
    tokens.put(COMPONENT_INSTANCE_NAME, instance.getCompInstanceName());
-    tokens.put(CONTAINER_ID, instance.getContainer().getId().toString());
+    tokens.put(CONTAINER_ID, container.getId().toString());
    tokens.put(COMPONENT_ID,
        String.valueOf(instance.getCompInstanceId().getId()));
    tokens.putAll(instance.getComponent().getDependencyHostIpTokens());
@@ -20,6 +20,7 @@

 import org.apache.hadoop.metrics2.AbstractMetric;
 import org.apache.hadoop.service.CompositeService;
+import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity;
 import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEvent;
@@ -178,10 +179,10 @@ public void componentInstanceStarted(Container container,
    putEntity(entity);
  }

-  public void componentInstanceFinished(ComponentInstance instance,
+  public void componentInstanceFinished(ContainerId containerId,
      int exitCode, String diagnostics) {
    TimelineEntity entity = createComponentInstanceEntity(
-        instance.getContainer().getId().toString());
+        containerId.toString());

    // create info keys
    Map<String, Object> entityInfos = new HashMap<String, Object>();
@@ -24,14 +24,8 @@
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
 import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
-import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
-import org.apache.hadoop.yarn.api.records.ApplicationId;
-import org.apache.hadoop.yarn.api.records.Container;
-import org.apache.hadoop.yarn.api.records.ContainerId;
-import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
-import org.apache.hadoop.yarn.api.records.NodeId;
-import org.apache.hadoop.yarn.api.records.Priority;
-import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.*;
 import org.apache.hadoop.yarn.client.api.AMRMClient;
 import org.apache.hadoop.yarn.client.api.NMClient;
 import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
@@ -42,15 +36,15 @@
 import org.apache.hadoop.yarn.service.api.records.Service;
 import org.apache.hadoop.yarn.service.component.Component;
 import org.apache.hadoop.yarn.service.component.ComponentState;
+import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
+import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState;
 import org.apache.hadoop.yarn.service.exceptions.BadClusterStateException;
 import org.apache.hadoop.yarn.service.registry.YarnRegistryViewForProviders;
 import org.apache.hadoop.yarn.service.utils.SliderFileSystem;
+import org.apache.hadoop.yarn.util.Records;

 import java.io.IOException;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.*;
 import java.util.concurrent.TimeoutException;

 import static org.mockito.Mockito.mock;
@@ -63,6 +57,8 @@ public class MockServiceAM extends ServiceMaster {
  final List<Container> feedContainers =
      Collections.synchronizedList(new LinkedList<>());

+  final List<ContainerStatus> failedContainers =
+      Collections.synchronizedList(new LinkedList<>());
  public MockServiceAM(Service service) {
    super(service.getName());
    this.service = service;
@@ -102,10 +98,10 @@ protected AMRMClientAsync<AMRMClient.ContainerRequest> createAMRMClient() {

        AllocateResponse.AllocateResponseBuilder builder =
            AllocateResponse.newBuilder();
+        // add new containers if any
        synchronized (feedContainers) {
          if (feedContainers.isEmpty()) {
            System.out.println("Allocating........ no containers");
-            return builder.build();
          } else {
            // The AMRMClient will return containers for compoenent that are
            // at FLEXING state
@@ -121,9 +117,20 @@ protected AMRMClientAsync<AMRMClient.ContainerRequest> createAMRMClient() {
              itor.remove();
            }
          }
-            builder.allocatedContainers(allocatedContainers).build();
+            builder.allocatedContainers(allocatedContainers);
          }
        }
+
+        // add failed containers if any
+        synchronized (failedContainers) {
+          if (!failedContainers.isEmpty()) {
+            List<ContainerStatus> failed =
+                new LinkedList<>(failedContainers);
+            failedContainers.clear();
+            builder.completedContainersStatuses(failed);
+          }
+        }
+        return builder.build();
      }

      @Override
@@ -184,6 +191,19 @@ public Container feedContainerToComp(Service service, int id,
    return container;
  }

+  public void feedFailedContainerToComp(Service service, int id, String
+      compName) {
+    ApplicationId applicationId = ApplicationId.fromString(service.getId());
+    ContainerId containerId = ContainerId
+        .newContainerId(ApplicationAttemptId.newInstance(applicationId, 1), id);
+    ContainerStatus containerStatus = Records.newRecord(ContainerStatus.class);
+    containerStatus.setContainerId(containerId);
+    synchronized (failedContainers) {
+      failedContainers.add(containerStatus);
+    }
+  }
+
+
  public void flexComponent(String compName, long numberOfContainers)
      throws IOException {
    ClientAMProtocol.ComponentCountProto componentCountProto =
@@ -218,4 +238,22 @@ public void waitForNumDesiredContainers(String compName,
      }
    }, 1000, 20000);
  }
+
+
+  public ComponentInstance getCompInstance(String compName, String
+      instanceName) {
+    return context.scheduler.getAllComponents().get(compName)
+        .getComponentInstance(instanceName);
+  }
+
+  public void waitForCompInstanceState(ComponentInstance instance,
+      ComponentInstanceState state)
+      throws TimeoutException, InterruptedException {
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        return instance.getState().equals(state);
+      }
+    }, 1000, 20000);
+  }
 }
@@ -65,6 +65,7 @@ public class ServiceTestUtils {

  private MiniYARNCluster yarnCluster = null;
  private MiniDFSCluster hdfsCluster = null;
+  TestingCluster zkCluster;
  private FileSystem fs = null;
  private Configuration conf = null;
  public static final int NUM_NMS = 1;
@@ -165,7 +166,6 @@ protected void setupInternal(int numNodeManager)
    conf.setBoolean(NM_VMEM_CHECK_ENABLED, false);
    conf.setBoolean(NM_PMEM_CHECK_ENABLED, false);
    // setup zk cluster
-    TestingCluster zkCluster;
    zkCluster = new TestingCluster(1);
    zkCluster.start();
    conf.set(YarnConfiguration.RM_ZK_ADDRESS, zkCluster.getConnectString());
@@ -239,6 +239,9 @@ public void shutdown() throws IOException {
        hdfsCluster = null;
      }
    }
+    if (zkCluster != null) {
+      zkCluster.stop();
+    }
    if (basedir != null) {
      FileUtils.deleteDirectory(basedir);
    }
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.service;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.curator.test.TestingCluster;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.service.api.records.Service;
+import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
+import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.hadoop.registry.client.api.RegistryConstants
+    .KEY_REGISTRY_ZK_QUORUM;
+
+public class TestServiceAM extends ServiceTestUtils{
+
+  private File basedir;
+  YarnConfiguration conf = new YarnConfiguration();
+  TestingCluster zkCluster;
+
+  @Before
+  public void setup() throws Exception {
+    basedir = new File("target", "apps");
+    if (basedir.exists()) {
+      FileUtils.deleteDirectory(basedir);
+    } else {
+      basedir.mkdirs();
+    }
+    zkCluster = new TestingCluster(1);
+    zkCluster.start();
+    conf.set(KEY_REGISTRY_ZK_QUORUM, zkCluster.getConnectString());
+    System.out.println("ZK cluster: " + zkCluster.getConnectString());
+  }
+
+  @After
+  public void tearDown() throws IOException {
+    if (basedir != null) {
+      FileUtils.deleteDirectory(basedir);
+    }
+    if (zkCluster != null) {
+      zkCluster.stop();
+    }
+  }
+
+  // Race condition YARN-7486
+  // 1. Allocate 1 container to compa and wait it to be started
+  // 2. Fail this container, and in the meanwhile allocate the 2nd container.
+  // 3. The 2nd container should not be assigned to compa-0 instance, because
+  // the compa-0 instance is not stopped yet.
+  // 4. check compa still has the instance in the pending list.
+  @Test
+  public void testContainerCompleted() throws TimeoutException,
+      InterruptedException {
+    ApplicationId applicationId = ApplicationId.newInstance(123456, 1);
+    Service exampleApp = new Service();
+    exampleApp.setId(applicationId.toString());
+    exampleApp.setName("testContainerCompleted");
+    exampleApp.addComponent(createComponent("compa", 1, "pwd"));
+
+    MockServiceAM am = new MockServiceAM(exampleApp);
+    am.init(conf);
+    am.start();
+
+    ComponentInstance compa0 = am.getCompInstance("compa", "compa-0");
+    // allocate a container
+    am.feedContainerToComp(exampleApp, 1, "compa");
+    am.waitForCompInstanceState(compa0, ComponentInstanceState.STARTED);
+
+    System.out.println("Fail the container 1");
+    // fail the container
+    am.feedFailedContainerToComp(exampleApp, 1, "compa");
+
+    // allocate the second container immediately, this container will not be
+    // assigned to comp instance
+    // because the instance is not yet added to the pending list.
+    am.feedContainerToComp(exampleApp, 2, "compa");
+
+    am.waitForCompInstanceState(compa0, ComponentInstanceState.INIT);
+    // still 1 pending instance
+    Assert.assertEquals(1,
+        am.getComponent("compa").getPendingInstances().size());
+    am.stop();
+  }
+}
@@ -20,6 +20,7 @@
 package org.apache.hadoop.yarn.service.monitor;

 import org.apache.commons.io.FileUtils;
+import org.apache.curator.test.TestingCluster;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.service.MockServiceAM;
@@ -37,10 +38,14 @@
 import java.io.IOException;
 import java.util.Collections;

+import static org.apache.hadoop.registry.client.api.RegistryConstants
+    .KEY_REGISTRY_ZK_QUORUM;
+
 public class TestServiceMonitor extends ServiceTestUtils {

   private File basedir;
   YarnConfiguration conf = new YarnConfiguration();
+  TestingCluster zkCluster;

   @Before
   public void setup() throws Exception {
@@ -51,6 +56,10 @@ public void setup() throws Exception {
      basedir.mkdirs();
    }
    conf.setLong(YarnServiceConf.READINESS_CHECK_INTERVAL, 2);
+    zkCluster = new TestingCluster(1);
+    zkCluster.start();
+    conf.set(KEY_REGISTRY_ZK_QUORUM, zkCluster.getConnectString());
+    System.out.println("ZK cluster: " + zkCluster.getConnectString());
  }

  @After
@@ -58,6 +67,9 @@ public void tearDown() throws IOException {
    if (basedir != null) {
      FileUtils.deleteDirectory(basedir);
    }
+    if (zkCluster != null) {
+      zkCluster.stop();
+    }
  }

  // Create compa with 1 container