From fab20109ae0e061b7999f58ee924688d787f7804 Mon Sep 17 00:00:00 2001 From: Jason Darrell Lowe Date: Thu, 21 Aug 2014 22:45:35 +0000 Subject: [PATCH] svn merge -c 1619614 FIXES: YARN-2434. RM should not recover containers from previously failed attempt when AM restart is not enabled. Contributed by Jian He git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1619616 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../scheduler/AbstractYarnScheduler.java | 13 +++++++++++++ .../TestWorkPreservingRMRestart.java | 13 +++++++++++++ 3 files changed, 29 insertions(+) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index ad433648b57..026d54b2d8e 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -211,6 +211,9 @@ Release 2.6.0 - UNRELEASED YARN-2424. LCE should support non-cgroups, non-secure mode (Chris Douglas via aw) + YARN-2434. RM should not recover containers from previously failed attempt + when AM restart is not enabled (Jian He via jlowe) + Release 2.5.0 - 2014-08-11 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 72ee7dbbe0a..ab56bb97212 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -273,6 +273,19 @@ public abstract class AbstractYarnScheduler 
SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt(); + if (!rmApp.getApplicationSubmissionContext() + .getKeepContainersAcrossApplicationAttempts()) { + // Do not recover containers for stopped attempt or previous attempt. + if (schedulerAttempt.isStopped() + || !schedulerAttempt.getApplicationAttemptId().equals( + container.getContainerId().getApplicationAttemptId())) { + LOG.info("Skip recovering container " + container + + " for already stopped attempt."); + killOrphanContainerOnNode(nm, container); + continue; + } + } + // create container RMContainer rmContainer = recoverAndCreateContainer(container, nm); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java index df64d4c32d2..d6af0d7307e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java @@ -513,6 +513,19 @@ public class TestWorkPreservingRMRestart { // just-recovered containers. 
assertNull(scheduler.getRMContainer(runningContainer.getContainerId())); assertNull(scheduler.getRMContainer(completedContainer.getContainerId())); + + rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1); + + MockNM nm2 = + new MockNM("127.1.1.1:4321", 8192, rm2.getResourceTrackerService()); + NMContainerStatus previousAttemptContainer = + TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 4, + ContainerState.RUNNING); + nm2.registerNode(Arrays.asList(previousAttemptContainer), null); + // Wait for the RM to settle down after recovering containers. + Thread.sleep(3000); + // Check that containers from the previously failed attempt are not recovered. + assertNull(scheduler.getRMContainer(previousAttemptContainer.getContainerId())); } // Apps already completed before RM restart. Restarted RM scheduler should not