YARN-6872. Ensure apps could run given NodeLabels are disabled post RM switchover/restart. Contributed by Sunil G
This commit is contained in:
parent
b38a1eea8e
commit
91f120f743
|
@ -43,7 +43,6 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
|||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.EventHandler;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
|
||||
import org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException;
|
||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||
import org.apache.hadoop.yarn.ipc.RPCUtil;
|
||||
import org.apache.hadoop.yarn.security.AccessRequest;
|
||||
|
@ -65,7 +64,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
||||
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
|
||||
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||
|
||||
|
@ -349,36 +347,6 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
|
|||
createAndPopulateNewRMApp(appContext, appState.getSubmitTime(),
|
||||
appState.getUser(), true, appState.getStartTime());
|
||||
|
||||
// If null amReq has been returned, check if it is the case that
|
||||
// application has specified node label expression while node label
|
||||
// has been disabled. Reject the recovery of this application if it
|
||||
// is true and give clear message so that user can react properly.
|
||||
if (!appContext.getUnmanagedAM() &&
|
||||
(application.getAMResourceRequests() == null ||
|
||||
application.getAMResourceRequests().isEmpty()) &&
|
||||
!YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
|
||||
// check application submission context and see if am resource request
|
||||
// or application itself contains any node label expression.
|
||||
List<ResourceRequest> amReqsFromAppContext =
|
||||
appContext.getAMContainerResourceRequests();
|
||||
String labelExp =
|
||||
(amReqsFromAppContext != null && !amReqsFromAppContext.isEmpty()) ?
|
||||
amReqsFromAppContext.get(0).getNodeLabelExpression() : null;
|
||||
if (labelExp == null) {
|
||||
labelExp = appContext.getNodeLabelExpression();
|
||||
}
|
||||
if (labelExp != null &&
|
||||
!labelExp.equals(RMNodeLabelsManager.NO_LABEL)) {
|
||||
String message = "Failed to recover application " + appId
|
||||
+ ". NodeLabel is not enabled in cluster, but AM resource request "
|
||||
+ "contains a label expression.";
|
||||
LOG.warn(message);
|
||||
application.handle(
|
||||
new RMAppEvent(appId, RMAppEventType.APP_REJECTED, message));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
application.handle(new RMAppRecoverEvent(appId, rmState));
|
||||
}
|
||||
|
||||
|
@ -398,28 +366,8 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
|
|||
}
|
||||
|
||||
ApplicationId applicationId = submissionContext.getApplicationId();
|
||||
List<ResourceRequest> amReqs = null;
|
||||
try {
|
||||
amReqs = validateAndCreateResourceRequest(submissionContext, isRecovery);
|
||||
} catch (InvalidLabelResourceRequestException e) {
|
||||
// This can happen if the application had been submitted and run
|
||||
// with Node Label enabled but recover with Node Label disabled.
|
||||
// Thus there might be node label expression in the application's
|
||||
// resource requests. If this is the case, create RmAppImpl with
|
||||
// null amReq and reject the application later with clear error
|
||||
// message. So that the application can still be tracked by RM
|
||||
// after recovery and user can see what's going on and react accordingly.
|
||||
if (isRecovery &&
|
||||
!YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("AMResourceRequest is not created for " + applicationId
|
||||
+ ". NodeLabel is not enabled in cluster, but AM resource "
|
||||
+ "request contains a label expression.");
|
||||
}
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
List<ResourceRequest> amReqs = validateAndCreateResourceRequest(
|
||||
submissionContext, isRecovery);
|
||||
|
||||
// Verify and get the update application priority and set back to
|
||||
// submissionContext
|
||||
|
|
|
@ -67,6 +67,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstant
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
|
||||
|
@ -518,12 +519,20 @@ public abstract class AbstractYarnScheduler
|
|||
container.setVersion(status.getVersion());
|
||||
ApplicationAttemptId attemptId =
|
||||
container.getId().getApplicationAttemptId();
|
||||
String labelExpression = status.getNodeLabelExpression();
|
||||
// If NodeLabel is disabled but recovered container has label expression
|
||||
// it's better to suppress that and treat it as the default label.
|
||||
if (!status.getNodeLabelExpression().isEmpty() && !YarnConfiguration
|
||||
.areNodeLabelsEnabled(rmContext.getYarnConfiguration())) {
|
||||
labelExpression = RMNodeLabelsManager.NO_LABEL;
|
||||
}
|
||||
|
||||
RMContainer rmContainer =
|
||||
new RMContainerImpl(container,
|
||||
SchedulerRequestKey.extractFrom(container), attemptId,
|
||||
node.getNodeID(), applications.get(
|
||||
attemptId.getApplicationId()).getUser(), rmContext,
|
||||
status.getCreationTime(), status.getNodeLabelExpression());
|
||||
status.getCreationTime(), labelExpression);
|
||||
return rmContainer;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.util.Set;
|
|||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -56,6 +58,8 @@ import org.apache.hadoop.yarn.util.resource.Resources;
|
|||
@Unstable
|
||||
public class SchedulerUtils {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(SchedulerUtils.class);
|
||||
|
||||
private static final RecordFactory recordFactory =
|
||||
RecordFactoryProvider.getRecordFactory(null);
|
||||
|
||||
|
@ -200,9 +204,14 @@ public class SchedulerUtils {
|
|||
String labelExp = resReq.getNodeLabelExpression();
|
||||
if (!(RMNodeLabelsManager.NO_LABEL.equals(labelExp)
|
||||
|| null == labelExp)) {
|
||||
throw new InvalidLabelResourceRequestException(
|
||||
"Invalid resource request, node label not enabled "
|
||||
+ "but request contains label expression");
|
||||
String message = "NodeLabel is not enabled in cluster, but resource"
|
||||
+ " request contains a label expression.";
|
||||
LOG.warn(message);
|
||||
if (!isRecovery) {
|
||||
throw new InvalidLabelResourceRequestException(
|
||||
"Invalid resource request, node label not enabled "
|
||||
+ "but request contains label expression");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (null == queueInfo) {
|
||||
|
|
|
@ -2551,14 +2551,14 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
|
|||
}
|
||||
};
|
||||
|
||||
// rm should successfully start with app1 loaded back in FAILED state
|
||||
// due to node label not enabled but am resource request contains
|
||||
// node label expression.
|
||||
// rm should successfully start with app1 loaded back in SUCCESS state
|
||||
// by pushing the app to run its am container on the default label and
|
||||
// letting other containers run normally.
|
||||
|
||||
try {
|
||||
rm2.start();
|
||||
Assert.assertTrue("RM start successfully", true);
|
||||
Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
|
||||
rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
|
||||
} catch (Exception e) {
|
||||
LOG.debug("Exception on start", e);
|
||||
Assert.fail("RM should start without any issue");
|
||||
|
|
Loading…
Reference in New Issue