YARN-9056. Improved YARN service upgrade state logic for readiness check.

Contributed by Chandni Singh

(cherry picked from commit f657a2a661)
This commit is contained in:
Eric Yang 2018-11-27 18:36:59 -05:00
parent fa1d4ba7d4
commit db8b2a130c
9 changed files with 104 additions and 41 deletions

View File

@ -109,6 +109,7 @@ import static org.apache.hadoop.yarn.api.records.ContainerExitStatus
.KILLED_AFTER_APP_COMPLETION; .KILLED_AFTER_APP_COMPLETION;
import static org.apache.hadoop.yarn.service.api.ServiceApiConstants.*; import static org.apache.hadoop.yarn.service.api.ServiceApiConstants.*;
import static org.apache.hadoop.yarn.service.component.ComponentEventType.*; import static org.apache.hadoop.yarn.service.component.ComponentEventType.*;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.START;
import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes
.EXIT_FALSE; .EXIT_FALSE;
import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes import static org.apache.hadoop.yarn.service.exceptions.LauncherExitCodes
@ -827,9 +828,8 @@ public class ServiceScheduler extends CompositeService {
LOG.error("No component instance exists for {}", containerId); LOG.error("No component instance exists for {}", containerId);
return; return;
} }
ComponentInstanceEvent becomeReadyEvent = new ComponentInstanceEvent( dispatcher.getEventHandler().handle(
containerId, ComponentInstanceEventType.BECOME_READY); new ComponentInstanceEvent(containerId, START));
dispatcher.getEventHandler().handle(becomeReadyEvent);
} }
@Override @Override

View File

@ -177,13 +177,13 @@ public class Component implements EventHandler<ComponentEvent> {
new NeedsUpgradeTransition()) new NeedsUpgradeTransition())
.addTransition(STABLE, CANCEL_UPGRADING, CANCEL_UPGRADE, .addTransition(STABLE, CANCEL_UPGRADING, CANCEL_UPGRADE,
new NeedsUpgradeTransition()) new NeedsUpgradeTransition())
.addTransition(STABLE, EnumSet.of(STABLE), CHECK_STABLE, .addTransition(STABLE, EnumSet.of(STABLE, FLEXING), CHECK_STABLE,
new CheckStableTransition()) new CheckStableTransition())
// Cancel upgrade while previous upgrade is still in progress // Cancel upgrade while previous upgrade is still in progress
.addTransition(UPGRADING, CANCEL_UPGRADING, .addTransition(UPGRADING, CANCEL_UPGRADING,
CANCEL_UPGRADE, new NeedsUpgradeTransition()) CANCEL_UPGRADE, new NeedsUpgradeTransition())
.addTransition(UPGRADING, EnumSet.of(UPGRADING, STABLE), .addTransition(UPGRADING, EnumSet.of(UPGRADING, FLEXING, STABLE),
CHECK_STABLE, new CheckStableTransition()) CHECK_STABLE, new CheckStableTransition())
.addTransition(UPGRADING, UPGRADING, CONTAINER_COMPLETED, .addTransition(UPGRADING, UPGRADING, CONTAINER_COMPLETED,
new CompletedAfterUpgradeTransition()) new CompletedAfterUpgradeTransition())

View File

@ -131,7 +131,7 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
.addTransition(STARTED, INIT, STOP, .addTransition(STARTED, INIT, STOP,
new ContainerStoppedTransition()) new ContainerStoppedTransition())
.addTransition(STARTED, READY, BECOME_READY, .addTransition(STARTED, READY, BECOME_READY,
new ContainerBecomeReadyTransition()) new ContainerBecomeReadyTransition(false))
// FROM READY // FROM READY
.addTransition(READY, STARTED, BECOME_NOT_READY, .addTransition(READY, STARTED, BECOME_NOT_READY,
@ -144,16 +144,20 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
// FROM UPGRADING // FROM UPGRADING
.addTransition(UPGRADING, EnumSet.of(READY, CANCEL_UPGRADING), .addTransition(UPGRADING, EnumSet.of(READY, CANCEL_UPGRADING),
CANCEL_UPGRADE, new CancelUpgradeTransition()) CANCEL_UPGRADE, new CancelUpgradeTransition())
.addTransition(UPGRADING, EnumSet.of(READY), BECOME_READY, .addTransition(UPGRADING, EnumSet.of(REINITIALIZED), START,
new ReadyAfterUpgradeTransition()) new StartedAfterUpgradeTransition())
.addTransition(UPGRADING, UPGRADING, STOP, .addTransition(UPGRADING, UPGRADING, STOP,
new StoppedAfterUpgradeTransition()) new StoppedAfterUpgradeTransition())
// FROM CANCEL_UPGRADING // FROM CANCEL_UPGRADING
.addTransition(CANCEL_UPGRADING, EnumSet.of(CANCEL_UPGRADING, READY), .addTransition(CANCEL_UPGRADING, EnumSet.of(CANCEL_UPGRADING,
BECOME_READY, new ReadyAfterUpgradeTransition()) REINITIALIZED), START, new StartedAfterUpgradeTransition())
.addTransition(CANCEL_UPGRADING, EnumSet.of(CANCEL_UPGRADING, INIT), .addTransition(CANCEL_UPGRADING, EnumSet.of(CANCEL_UPGRADING, INIT),
STOP, new StoppedAfterCancelUpgradeTransition()) STOP, new StoppedAfterCancelUpgradeTransition())
.addTransition(REINITIALIZED, CANCEL_UPGRADING, CANCEL_UPGRADE,
new CancelledAfterReinitTransition())
.addTransition(REINITIALIZED, READY, BECOME_READY,
new ContainerBecomeReadyTransition(true))
.installTopology(); .installTopology();
public ComponentInstance(Component component, public ComponentInstance(Component component,
@ -229,16 +233,30 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
} }
private static class ContainerBecomeReadyTransition extends BaseTransition { private static class ContainerBecomeReadyTransition extends BaseTransition {
private final boolean isReinitialized;
ContainerBecomeReadyTransition(boolean isReinitialized) {
this.isReinitialized = isReinitialized;
}
@Override @Override
public void transition(ComponentInstance compInstance, public void transition(ComponentInstance compInstance,
ComponentInstanceEvent event) { ComponentInstanceEvent event) {
compInstance.setContainerState(ContainerState.READY); compInstance.setContainerState(ContainerState.READY);
if (!isReinitialized) {
compInstance.component.incContainersReady(true); compInstance.component.incContainersReady(true);
} else {
compInstance.component.incContainersReady(false);
ComponentEvent checkState = new ComponentEvent(
compInstance.component.getName(), ComponentEventType.CHECK_STABLE);
compInstance.scheduler.getDispatcher().getEventHandler().handle(
checkState);
}
compInstance.postContainerReady(); compInstance.postContainerReady();
} }
} }
private static class ReadyAfterUpgradeTransition implements private static class StartedAfterUpgradeTransition implements
MultipleArcTransition<ComponentInstance, ComponentInstanceEvent, MultipleArcTransition<ComponentInstance, ComponentInstanceEvent,
ComponentInstanceState> { ComponentInstanceState> {
@ -249,7 +267,7 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
if (instance.pendingCancelUpgrade) { if (instance.pendingCancelUpgrade) {
// cancellation of upgrade was triggered before the upgrade was // cancellation of upgrade was triggered before the upgrade was
// finished. // finished.
LOG.info("{} received ready but cancellation pending", LOG.info("{} received started but cancellation pending",
event.getContainerId()); event.getContainerId());
instance.upgradeInProgress.set(true); instance.upgradeInProgress.set(true);
instance.cancelUpgrade(); instance.cancelUpgrade();
@ -258,8 +276,7 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
} }
instance.upgradeInProgress.set(false); instance.upgradeInProgress.set(false);
instance.setContainerState(ContainerState.READY); instance.setContainerState(ContainerState.RUNNING_BUT_UNREADY);
instance.component.incContainersReady(false);
Component.UpgradeStatus status = instance.getState().equals(UPGRADING) ? Component.UpgradeStatus status = instance.getState().equals(UPGRADING) ?
instance.component.getUpgradeStatus() : instance.component.getUpgradeStatus() :
@ -267,12 +284,7 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
status.decContainersThatNeedUpgrade(); status.decContainersThatNeedUpgrade();
instance.serviceVersion = status.getTargetVersion(); instance.serviceVersion = status.getTargetVersion();
ComponentEvent checkState = new ComponentEvent( return ComponentInstanceState.REINITIALIZED;
instance.component.getName(),
ComponentEventType.CHECK_STABLE);
instance.scheduler.getDispatcher().getEventHandler().handle(checkState);
instance.postContainerReady();
return ComponentInstanceState.READY;
} }
} }
@ -570,6 +582,19 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
} }
} }
private static class CancelledAfterReinitTransition extends BaseTransition {
@Override
public void transition(ComponentInstance instance,
ComponentInstanceEvent event) {
if (instance.upgradeInProgress.compareAndSet(false, true)) {
instance.cancelUpgrade();
} else {
LOG.info("{} pending cancellation", event.getContainerId());
instance.pendingCancelUpgrade = true;
}
}
}
private static class CancelUpgradeTransition implements private static class CancelUpgradeTransition implements
MultipleArcTransition<ComponentInstance, ComponentInstanceEvent, MultipleArcTransition<ComponentInstance, ComponentInstanceEvent,
ComponentInstanceState> { ComponentInstanceState> {

View File

@ -23,5 +23,6 @@ public enum ComponentInstanceState {
STARTED, STARTED,
READY, READY,
UPGRADING, UPGRADING,
CANCEL_UPGRADING CANCEL_UPGRADING,
REINITIALIZED
} }

View File

@ -24,6 +24,7 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.service.ServiceContext; import org.apache.hadoop.yarn.service.ServiceContext;
import org.apache.hadoop.yarn.service.component.Component; import org.apache.hadoop.yarn.service.component.Component;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState;
import org.apache.hadoop.yarn.service.conf.YarnServiceConf; import org.apache.hadoop.yarn.service.conf.YarnServiceConf;
import org.apache.hadoop.yarn.service.component.ComponentEvent; import org.apache.hadoop.yarn.service.component.ComponentEvent;
import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEvent; import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEvent;
@ -37,6 +38,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState.REINITIALIZED;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState.STARTED; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceState.STARTED;
import static org.apache.hadoop.yarn.service.component.ComponentEventType.FLEX; import static org.apache.hadoop.yarn.service.component.ComponentEventType.FLEX;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.BECOME_NOT_READY; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.BECOME_NOT_READY;
@ -108,8 +110,9 @@ public class ServiceMonitor extends AbstractService {
ComponentInstance instance = entry.getValue(); ComponentInstance instance = entry.getValue();
ProbeStatus status = instance.ping(); ProbeStatus status = instance.ping();
ComponentInstanceState instanceState = instance.getState();
if (status.isSuccess()) { if (status.isSuccess()) {
if (instance.getState() == STARTED) { if (instanceState == STARTED || instanceState == REINITIALIZED) {
LOG.info("Readiness check succeeded for {}: {}", instance LOG.info("Readiness check succeeded for {}: {}", instance
.getCompInstanceName(), status); .getCompInstanceName(), status);
// synchronously update the state. // synchronously update the state.

View File

@ -306,10 +306,14 @@ public class TestServiceManager {
private void makeAllInstancesReady(ServiceContext context) private void makeAllInstancesReady(ServiceContext context)
throws TimeoutException, InterruptedException { throws TimeoutException, InterruptedException {
context.scheduler.getLiveInstances().forEach(((containerId, instance) -> { context.scheduler.getLiveInstances().forEach(((containerId, instance) -> {
ComponentInstanceEvent event = new ComponentInstanceEvent(containerId, ComponentInstanceEvent startEvent = new ComponentInstanceEvent(
ComponentInstanceEventType.BECOME_READY); containerId, ComponentInstanceEventType.START);
context.scheduler.getDispatcher().getEventHandler().handle(startEvent);
context.scheduler.getDispatcher().getEventHandler().handle(event); ComponentInstanceEvent becomeReadyEvent = new ComponentInstanceEvent(
containerId, ComponentInstanceEventType.BECOME_READY);
context.scheduler.getDispatcher().getEventHandler().handle(
becomeReadyEvent);
})); }));
GenericTestUtils.waitFor(()-> { GenericTestUtils.waitFor(()-> {
for (ComponentInstance instance: for (ComponentInstance instance:
@ -350,11 +354,17 @@ public class TestServiceManager {
// instances of comp1 get upgraded and become ready event is triggered // instances of comp1 get upgraded and become ready event is triggered
// become ready // become ready
compInstances.forEach(instance -> { compInstances.forEach(instance -> {
ComponentInstanceEvent event = new ComponentInstanceEvent( ComponentInstanceEvent startEvent = new ComponentInstanceEvent(
instance.getContainer().getId(),
ComponentInstanceEventType.START);
context.scheduler.getDispatcher().getEventHandler().handle(startEvent);
ComponentInstanceEvent becomeReadyEvent = new ComponentInstanceEvent(
instance.getContainer().getId(), instance.getContainer().getId(),
ComponentInstanceEventType.BECOME_READY); ComponentInstanceEventType.BECOME_READY);
context.scheduler.getDispatcher().getEventHandler().handle(event); context.scheduler.getDispatcher().getEventHandler().handle(
becomeReadyEvent);
}); });
GenericTestUtils.waitFor(() -> { GenericTestUtils.waitFor(() -> {

View File

@ -497,7 +497,8 @@ public class TestYarnNativeServices extends ServiceTestUtils {
} }
// Test to verify ANTI_AFFINITY placement policy // Test to verify ANTI_AFFINITY placement policy
// 1. Start mini cluster with 3 NMs and scheduler placement-constraint handler // 1. Start mini cluster
// with 3 NMs and scheduler placement-constraint handler
// 2. Create an example service with 3 containers // 2. Create an example service with 3 containers
// 3. Verify no more than 1 container comes up in each of the 3 NMs // 3. Verify no more than 1 container comes up in each of the 3 NMs
// 4. Flex the component to 4 containers // 4. Flex the component to 4 containers

View File

@ -38,6 +38,7 @@ import org.junit.Test;
import java.util.Iterator; import java.util.Iterator;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.BECOME_READY; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.BECOME_READY;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.START;
import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.STOP; import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.STOP;
/** /**
@ -159,6 +160,8 @@ public class TestComponent {
// reinitialization of a container done // reinitialization of a container done
for(ComponentInstance instance : comp.getAllComponentInstances()) { for(ComponentInstance instance : comp.getAllComponentInstances()) {
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), START));
instance.handle(new ComponentInstanceEvent( instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), BECOME_READY)); instance.getContainer().getId(), BECOME_READY));
} }
@ -199,6 +202,8 @@ public class TestComponent {
// second instance finished upgrading // second instance finished upgrading
ComponentInstance instance2 = iter.next(); ComponentInstance instance2 = iter.next();
instance2.handle(new ComponentInstanceEvent(
instance2.getContainer().getId(), ComponentInstanceEventType.START));
instance2.handle(new ComponentInstanceEvent( instance2.handle(new ComponentInstanceEvent(
instance2.getContainer().getId(), instance2.getContainer().getId(),
ComponentInstanceEventType.BECOME_READY)); ComponentInstanceEventType.BECOME_READY));
@ -230,6 +235,9 @@ public class TestComponent {
// cancel upgrade successful for both instances // cancel upgrade successful for both instances
for(ComponentInstance instance : comp.getAllComponentInstances()) { for(ComponentInstance instance : comp.getAllComponentInstances()) {
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(),
ComponentInstanceEventType.START));
instance.handle(new ComponentInstanceEvent( instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), instance.getContainer().getId(),
ComponentInstanceEventType.BECOME_READY)); ComponentInstanceEventType.BECOME_READY));

View File

@ -98,7 +98,12 @@ public class TestComponentInstance {
ComponentInstanceEvent instanceEvent = new ComponentInstanceEvent( ComponentInstanceEvent instanceEvent = new ComponentInstanceEvent(
instance.getContainer().getId(), ComponentInstanceEventType.UPGRADE); instance.getContainer().getId(), ComponentInstanceEventType.UPGRADE);
instance.handle(instanceEvent); instance.handle(instanceEvent);
instance.handle(new ComponentInstanceEvent(instance.getContainer().getId(),
ComponentInstanceEventType.START));
Assert.assertEquals("instance not running",
ContainerState.RUNNING_BUT_UNREADY,
component.getComponentSpec().getContainer(instance.getContainer()
.getId().toString()).getState());
instance.handle(new ComponentInstanceEvent(instance.getContainer().getId(), instance.handle(new ComponentInstanceEvent(instance.getContainer().getId(),
ComponentInstanceEventType.BECOME_READY)); ComponentInstanceEventType.BECOME_READY));
Assert.assertEquals("instance not ready", ContainerState.READY, Assert.assertEquals("instance not ready", ContainerState.READY,
@ -246,23 +251,33 @@ public class TestComponentInstance {
instance.handle(cancelEvent); instance.handle(cancelEvent);
// either upgrade failed or successful // either upgrade failed or successful
ComponentInstanceEvent readyOrStopEvent = new ComponentInstanceEvent( if (upgradeSuccessful) {
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), ComponentInstanceEventType.START));
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), instance.getContainer().getId(),
upgradeSuccessful ? ComponentInstanceEventType.BECOME_READY : ComponentInstanceEventType.BECOME_READY));
ComponentInstanceEventType.STOP); } else {
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(),
ComponentInstanceEventType.STOP));
}
instance.handle(readyOrStopEvent);
Assert.assertEquals("instance not upgrading", ContainerState.UPGRADING, Assert.assertEquals("instance not upgrading", ContainerState.UPGRADING,
component.getComponentSpec().getContainer(instance.getContainer() component.getComponentSpec().getContainer(instance.getContainer()
.getId().toString()).getState()); .getId().toString()).getState());
// response for cancel received // response for cancel received
ComponentInstanceEvent readyOrStopCancel = new ComponentInstanceEvent( if (cancelUpgradeSuccessful) {
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), ComponentInstanceEventType.START));
instance.handle(new ComponentInstanceEvent(
instance.getContainer().getId(), instance.getContainer().getId(),
cancelUpgradeSuccessful ? ComponentInstanceEventType.BECOME_READY : ComponentInstanceEventType.BECOME_READY));
ComponentInstanceEventType.STOP); } else {
instance.handle(new ComponentInstanceEvent(
instance.handle(readyOrStopCancel); instance.getContainer().getId(), ComponentInstanceEventType.STOP));
}
if (cancelUpgradeSuccessful) { if (cancelUpgradeSuccessful) {
Assert.assertEquals("instance not ready", ContainerState.READY, Assert.assertEquals("instance not ready", ContainerState.READY,
component.getComponentSpec().getContainer(instance.getContainer() component.getComponentSpec().getContainer(instance.getContainer()