YARN-6872. Ensure apps could run given NodeLabels are disabled post RM switchover/restart. Contributed by Sunil G

This commit is contained in:
Jian He 2017-08-01 09:56:33 -07:00
parent b38a1eea8e
commit 91f120f743
4 changed files with 29 additions and 63 deletions

View File

@ -43,7 +43,6 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException; import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
import org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.ipc.RPCUtil; import org.apache.hadoop.yarn.ipc.RPCUtil;
import org.apache.hadoop.yarn.security.AccessRequest; import org.apache.hadoop.yarn.security.AccessRequest;
@ -65,7 +64,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@ -349,36 +347,6 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
createAndPopulateNewRMApp(appContext, appState.getSubmitTime(), createAndPopulateNewRMApp(appContext, appState.getSubmitTime(),
appState.getUser(), true, appState.getStartTime()); appState.getUser(), true, appState.getStartTime());
// If null amReq has been returned, check if it is the case that
// application has specified node label expression while node label
// has been disabled. Reject the recovery of this application if it
// is true and give clear message so that user can react properly.
if (!appContext.getUnmanagedAM() &&
(application.getAMResourceRequests() == null ||
application.getAMResourceRequests().isEmpty()) &&
!YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
// check application submission context and see if am resource request
// or application itself contains any node label expression.
List<ResourceRequest> amReqsFromAppContext =
appContext.getAMContainerResourceRequests();
String labelExp =
(amReqsFromAppContext != null && !amReqsFromAppContext.isEmpty()) ?
amReqsFromAppContext.get(0).getNodeLabelExpression() : null;
if (labelExp == null) {
labelExp = appContext.getNodeLabelExpression();
}
if (labelExp != null &&
!labelExp.equals(RMNodeLabelsManager.NO_LABEL)) {
String message = "Failed to recover application " + appId
+ ". NodeLabel is not enabled in cluster, but AM resource request "
+ "contains a label expression.";
LOG.warn(message);
application.handle(
new RMAppEvent(appId, RMAppEventType.APP_REJECTED, message));
return;
}
}
application.handle(new RMAppRecoverEvent(appId, rmState)); application.handle(new RMAppRecoverEvent(appId, rmState));
} }
@ -398,28 +366,8 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
} }
ApplicationId applicationId = submissionContext.getApplicationId(); ApplicationId applicationId = submissionContext.getApplicationId();
List<ResourceRequest> amReqs = null; List<ResourceRequest> amReqs = validateAndCreateResourceRequest(
try { submissionContext, isRecovery);
amReqs = validateAndCreateResourceRequest(submissionContext, isRecovery);
} catch (InvalidLabelResourceRequestException e) {
// This can happen if the application had been submitted and run
// with Node Label enabled but recover with Node Label disabled.
// Thus there might be node label expression in the application's
// resource requests. If this is the case, create RmAppImpl with
// null amReq and reject the application later with clear error
// message. So that the application can still be tracked by RM
// after recovery and user can see what's going on and react accordingly.
if (isRecovery &&
!YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
if (LOG.isDebugEnabled()) {
LOG.debug("AMResourceRequest is not created for " + applicationId
+ ". NodeLabel is not enabled in cluster, but AM resource "
+ "request contains a label expression.");
}
} else {
throw e;
}
}
// Verify and get the update application priority and set back to // Verify and get the update application priority and set back to
// submissionContext // submissionContext

View File

@ -67,6 +67,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstant
import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
@ -518,12 +519,20 @@ public abstract class AbstractYarnScheduler
container.setVersion(status.getVersion()); container.setVersion(status.getVersion());
ApplicationAttemptId attemptId = ApplicationAttemptId attemptId =
container.getId().getApplicationAttemptId(); container.getId().getApplicationAttemptId();
String labelExpression = status.getNodeLabelExpression();
// If node labels are disabled but the recovered container carries a label
// expression, suppress it and treat the container as using the default label.
if (!status.getNodeLabelExpression().isEmpty() && !YarnConfiguration
.areNodeLabelsEnabled(rmContext.getYarnConfiguration())) {
labelExpression = RMNodeLabelsManager.NO_LABEL;
}
RMContainer rmContainer = RMContainer rmContainer =
new RMContainerImpl(container, new RMContainerImpl(container,
SchedulerRequestKey.extractFrom(container), attemptId, SchedulerRequestKey.extractFrom(container), attemptId,
node.getNodeID(), applications.get( node.getNodeID(), applications.get(
attemptId.getApplicationId()).getUser(), rmContext, attemptId.getApplicationId()).getUser(), rmContext,
status.getCreationTime(), status.getNodeLabelExpression()); status.getCreationTime(), labelExpression);
return rmContainer; return rmContainer;
} }

View File

@ -22,6 +22,8 @@ import java.util.Set;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -55,7 +57,9 @@ import org.apache.hadoop.yarn.util.resource.Resources;
@Private @Private
@Unstable @Unstable
public class SchedulerUtils { public class SchedulerUtils {
private static final Log LOG = LogFactory.getLog(SchedulerUtils.class);
private static final RecordFactory recordFactory = private static final RecordFactory recordFactory =
RecordFactoryProvider.getRecordFactory(null); RecordFactoryProvider.getRecordFactory(null);
@ -200,9 +204,14 @@ public class SchedulerUtils {
String labelExp = resReq.getNodeLabelExpression(); String labelExp = resReq.getNodeLabelExpression();
if (!(RMNodeLabelsManager.NO_LABEL.equals(labelExp) if (!(RMNodeLabelsManager.NO_LABEL.equals(labelExp)
|| null == labelExp)) { || null == labelExp)) {
throw new InvalidLabelResourceRequestException( String message = "NodeLabel is not enabled in cluster, but resource"
"Invalid resource request, node label not enabled " + " request contains a label expression.";
+ "but request contains label expression"); LOG.warn(message);
if (!isRecovery) {
throw new InvalidLabelResourceRequestException(
"Invalid resource request, node label not enabled "
+ "but request contains label expression");
}
} }
} }
if (null == queueInfo) { if (null == queueInfo) {

View File

@ -2551,14 +2551,14 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
} }
}; };
// rm should successfully start with app1 loaded back in FAILED state // rm should successfully start with app1 loaded back in SUCCESS state
// due to node label not enabled but am resource request contains // by pushing app to run default label for am container and let other
// node label expression. // containers to run normally.
try { try {
rm2.start(); rm2.start();
Assert.assertTrue("RM start successfully", true); Assert.assertTrue("RM start successfully", true);
Assert.assertEquals(1, rm2.getRMContext().getRMApps().size()); Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
} catch (Exception e) { } catch (Exception e) {
LOG.debug("Exception on start", e); LOG.debug("Exception on start", e);
Assert.fail("RM should start without any issue"); Assert.fail("RM should start without any issue");